Compare commits

..

3 Commits

Author SHA1 Message Date
gru-agent[bot]
1066ea62e3 Add tests for filterSafeProps function in Markdown utils test suite 2025-05-29 16:12:14 +00:00
Archer
8ed35ffe7e Update dataset.md (#4927) 2025-05-29 18:25:59 +08:00
Archer
0f866fc552 feat: text collecion auto save for a txt file (#4924) 2025-05-29 17:57:27 +08:00
12 changed files with 195 additions and 140 deletions

View File

@@ -645,7 +645,7 @@ data 为集合的 ID。
{{< /tab >}} {{< /tab >}}
{{< /tabs >}} {{< /tabs >}}
### 创建一个外部文件库集合(商业版 ### 创建一个外部文件库集合(弃用
{{< tabs tabTotal="3" >}} {{< tabs tabTotal="3" >}}
{{< tab tabName="请求示例" >}} {{< tab tabName="请求示例" >}}

View File

@@ -40,5 +40,6 @@ export function getSourceNameIcon({
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => { export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
if (mode === TrainingModeEnum.qa) return data.length * 20; if (mode === TrainingModeEnum.qa) return data.length * 20;
if (mode === TrainingModeEnum.auto) return data.length * 5; if (mode === TrainingModeEnum.auto) return data.length * 5;
if (mode === TrainingModeEnum.image) return data.length * 2;
return data.length; return data.length;
}; };

View File

@@ -223,7 +223,7 @@ export const readFileContentFromMongo = async ({
rawText: string; rawText: string;
filename: string; filename: string;
}> => { }> => {
const bufferId = `${fileId}-${customPdfParse}`; const bufferId = `${String(fileId)}-${customPdfParse}`;
// read buffer // read buffer
const fileBuffer = await getRawTextBuffer(bufferId); const fileBuffer = await getRawTextBuffer(bufferId);
if (fileBuffer) { if (fileBuffer) {

View File

@@ -1,5 +1,57 @@
import { detectFileEncoding } from '@fastgpt/global/common/file/tools'; import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { PassThrough } from 'stream'; import { PassThrough } from 'stream';
import { getGridBucket } from './controller';
import { type BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { retryFn } from '@fastgpt/global/common/system/utils';
/**
 * Store a text string as a file in the given GridFS bucket.
 *
 * The text is encoded into a Buffer (Node's default UTF-8 encoding) and written
 * through a GridFS upload stream. The upload is wrapped in `retryFn`; a new
 * upload stream is opened for every attempt, because a stream that has already
 * been ended or has emitted an error cannot accept the data again.
 *
 * @param bucket - Name of the GridFS bucket to write to.
 * @param filename - File name recorded for the uploaded file.
 * @param text - Text content to store.
 * @param metadata - Metadata object passed through to the upload stream.
 * @returns An object whose `fileId` is the string form of the upload stream id.
 */
export const createFileFromText = async ({
  bucket,
  filename,
  text,
  metadata
}: {
  bucket: `${BucketNameEnum}`;
  filename: string;
  text: string;
  metadata: Record<string, any>;
}) => {
  const gridBucket = getGridBucket(bucket);
  const buffer = Buffer.from(text);
  const fileSize = buffer.length;

  // Chunk size: as large as possible, but at most 14MB and at least 128KB.
  const chunkSizeBytes = (() => {
    // Ideal chunk size: file size / target chunk count (10), capped at 14MB.
    const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);

    // Never go below 128KB per chunk.
    const minChunkSize = 128 * 1024;

    // Use whichever is larger: the ideal size or the minimum size.
    let chunkSize = Math.max(idealChunkSize, minChunkSize);

    // Round up to the nearest multiple of 64KB to keep the value tidy.
    chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);

    return chunkSize;
  })();

  return retryFn(async () => {
    return new Promise<{ fileId: string }>((resolve, reject) => {
      // Open the stream inside the attempt so each retry gets a writable stream.
      const uploadStream = gridBucket.openUploadStream(filename, {
        metadata,
        chunkSizeBytes
      });

      // Register listeners before writing so no event can be missed.
      uploadStream.on('finish', () => {
        resolve({ fileId: String(uploadStream.id) });
      });
      uploadStream.on('error', reject);

      uploadStream.end(buffer);
    });
  });
};
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
return new Promise<Buffer>((resolve, reject) => { return new Promise<Buffer>((resolve, reject) => {

View File

@@ -49,7 +49,7 @@ const CustomTextInput = () => {
createStatus: 'waiting', createStatus: 'waiting',
rawText: data.value, rawText: data.value,
sourceName: data.name, sourceName: data.name,
icon: 'file/fill/manual' icon: 'file/fill/txt'
} }
]); ]);
goToNext(); goToNext();

View File

@@ -6,6 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant
import { NextAPI } from '@/service/middleware/entry'; import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { type CreateCollectionResponse } from '@/global/core/dataset/api'; import { type CreateCollectionResponse } from '@/global/core/dataset/api';
import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils';
async function handler(req: NextApiRequest): CreateCollectionResponse { async function handler(req: NextApiRequest): CreateCollectionResponse {
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams; const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
@@ -18,6 +19,18 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
per: WritePermissionVal per: WritePermissionVal
}); });
// 1. Create file from text
const filename = `${name}.txt`;
const { fileId } = await createFileFromText({
bucket: 'dataset',
filename,
text,
metadata: {
teamId,
uid: tmbId
}
});
const { collectionId, insertResults } = await createCollectionAndInsertData({ const { collectionId, insertResults } = await createCollectionAndInsertData({
dataset, dataset,
rawText: text, rawText: text,
@@ -25,9 +38,9 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
...body, ...body,
teamId, teamId,
tmbId, tmbId,
type: DatasetCollectionTypeEnum.virtual, type: DatasetCollectionTypeEnum.file,
fileId,
name name: filename
} }
}); });

View File

@@ -2,7 +2,7 @@ import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/sch
import { pushQAUsage } from '@/service/support/wallet/usage/push'; import { pushQAUsage } from '@/service/support/wallet/usage/push';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { createChatCompletion } from '@fastgpt/service/core/ai/config'; import { createChatCompletion } from '@fastgpt/service/core/ai/config';
import type { ChatCompletionMessageParam, StreamChatType } from '@fastgpt/global/core/ai/type.d'; import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
import { addLog } from '@fastgpt/service/common/system/log'; import { addLog } from '@fastgpt/service/common/system/log';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { replaceVariable } from '@fastgpt/global/common/string/tools'; import { replaceVariable } from '@fastgpt/global/common/string/tools';

View File

@@ -1,6 +1,6 @@
import { TeamErrEnum } from '@fastgpt/global/common/error/code/team'; import { TeamErrEnum } from '@fastgpt/global/common/error/code/team';
import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit'; import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit';
import { sendOneInform } from '../support/user/inform/api'; import { sendOneInform } from '../../../support/user/inform/api';
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller'; import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants'; import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants';
@@ -18,7 +18,7 @@ export const checkTeamAiPointsAndLock = async (teamId: string) => {
templateParam: {}, templateParam: {},
teamId teamId
}); });
console.log('余额不足,暂停【向量】生成任务'); console.log('余额不足,暂停训练生成任务');
await lockTrainingDataByTeamId(teamId); await lockTrainingDataByTeamId(teamId);
} catch (error) {} } catch (error) {}
} }

View File

@@ -1,5 +1,5 @@
import { generateQA } from '@/service/events/generateQA'; import { generateQA } from '@/service/core/dataset/queues/generateQA';
import { generateVector } from '@/service/events/generateVector'; import { generateVector } from '@/service/core/dataset/queues/generateVector';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';

View File

@@ -1,5 +1,5 @@
import { describe, it, expect } from 'vitest'; import { describe, it, expect } from 'vitest';
import { mdTextFormat, CodeClassNameEnum } from '@/components/Markdown/utils'; import { mdTextFormat, CodeClassNameEnum, filterSafeProps } from '@/components/Markdown/utils';
describe('Markdown utils', () => { describe('Markdown utils', () => {
describe('mdTextFormat', () => { describe('mdTextFormat', () => {
@@ -56,4 +56,121 @@ describe('Markdown utils', () => {
expect(CodeClassNameEnum.audio).toBe('audio'); expect(CodeClassNameEnum.audio).toBe('audio');
}); });
}); });
describe('filterSafeProps', () => {
  // Attribute whitelist shared by every case in this suite.
  const allowedAttrs = new Set(['class', 'style', 'title', 'id']);

  // Runs the filter with the shared whitelist and checks the surviving props.
  const expectFiltered = (
    input: Parameters<typeof filterSafeProps>[0],
    expected: unknown
  ) => {
    expect(filterSafeProps(input, allowedAttrs)).toEqual(expected);
  };

  it('should filter out non-whitelisted attributes', () => {
    expectFiltered(
      { class: 'test', nonexistent: 'value', title: 'title' },
      { class: 'test', title: 'title' }
    );
  });

  it('should filter out dangerous event handlers', () => {
    expectFiltered(
      {
        class: 'test',
        onClick: () => {},
        onMouseover: () => {}
      },
      { class: 'test' }
    );
  });

  it('should filter out dangerous protocols', () => {
    expectFiltered(
      {
        title: 'javascript:alert(1)',
        id: 'vbscript:alert(1)',
        class: 'safe'
      },
      { class: 'safe' }
    );
  });

  it('should handle encoded malicious content', () => {
    // Same payload, once as HTML numeric entities and once percent-encoded.
    const htmlEntityPayload =
      '&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;alert(1)';
    const percentEncodedPayload = '%6A%61%76%61%73%63%72%69%70%74%3Aalert(1)';

    expectFiltered(
      { title: htmlEntityPayload, id: percentEncodedPayload, class: 'safe' },
      { class: 'safe' }
    );
  });

  it('should filter style objects', () => {
    // The whole style prop is expected to be dropped when one value is unsafe.
    expectFiltered(
      {
        style: { color: 'red', background: 'javascript:alert(1)' },
        class: 'test'
      },
      { class: 'test' }
    );
  });

  it('should handle empty and null values', () => {
    // Empty strings and nulls on whitelisted keys are expected to pass through.
    expectFiltered(
      { class: '', title: null, style: null },
      { class: '', title: null, style: null }
    );
  });

  it('should filter nested objects except style', () => {
    expectFiltered(
      {
        data: { key: 'value' },
        style: { color: 'red' },
        class: 'test'
      },
      { style: { color: 'red' }, class: 'test' }
    );
  });

  it('should handle multiple iterations of encoded content', () => {
    const doubleEncoded = encodeURIComponent(encodeURIComponent('javascript:alert(1)'));

    expectFiltered({ title: doubleEncoded, class: 'safe' }, { class: 'safe' });
  });

  it('should filter suspicious content patterns', () => {
    expectFiltered(
      {
        title: 'Function("alert(1)")',
        id: 'eval("alert(1)")',
        class: 'test'
      },
      { class: 'test' }
    );
  });
});
}); });

View File

@@ -1,128 +0,0 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import {
createDatasetTrainingMongoWatch,
startTrainingQueue
} from '@/service/core/dataset/training/utils';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { generateQA } from '@/service/core/dataset/queues/generateQA';
import { generateVector } from '@/service/core/dataset/queues/generateVector';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
// Replace the QA queue module with a factory whose only export is a bare
// vi.fn() spy, so the tests can assert on calls to generateQA.
vi.mock('@/service/core/dataset/queues/generateQA', () => ({
generateQA: vi.fn()
}));
// Same for the vector queue module: generateVector becomes a bare spy.
vi.mock('@/service/core/dataset/queues/generateVector', () => ({
generateVector: vi.fn()
}));
// Replace the training schema module: MongoDatasetTraining exposes only
// `watch`, which by default returns an object with a spy `on` method.
// Individual tests override this return value with their own `on` spy.
vi.mock('@fastgpt/service/core/dataset/training/schema', () => ({
MongoDatasetTraining: {
watch: vi.fn().mockReturnValue({
on: vi.fn()
})
}
}));
describe('dataset training utils', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  describe('createDatasetTrainingMongoWatch', () => {
    // Makes watch() return a fresh `on` spy, starts the watcher, and hands the spy back.
    const startWatcher = () => {
      const onSpy = vi.fn();
      vi.mocked(MongoDatasetTraining.watch).mockReturnValue({
        on: onSpy
      });
      createDatasetTrainingMongoWatch();
      return onSpy;
    };

    // Invokes the listener that the watcher registered through the `on` spy.
    const emitChange = (
      onSpy: ReturnType<typeof vi.fn>,
      operationType: string,
      mode: TrainingModeEnum
    ) => {
      const listener = onSpy.mock.calls[0][1];
      listener({
        operationType,
        fullDocument: {
          mode
        }
      });
    };

    it('should setup mongo watch and handle qa mode', () => {
      const onSpy = startWatcher();

      expect(MongoDatasetTraining.watch).toHaveBeenCalled();
      expect(onSpy).toHaveBeenCalledWith('change', expect.any(Function));

      // Simulate change event for QA mode
      emitChange(onSpy, 'insert', TrainingModeEnum.qa);

      expect(generateQA).toHaveBeenCalled();
      expect(generateVector).not.toHaveBeenCalled();
    });

    it('should handle chunk mode', () => {
      const onSpy = startWatcher();

      emitChange(onSpy, 'insert', TrainingModeEnum.chunk);

      expect(generateVector).toHaveBeenCalled();
      expect(generateQA).not.toHaveBeenCalled();
    });

    it('should ignore non-insert operations', () => {
      const onSpy = startWatcher();

      emitChange(onSpy, 'update', TrainingModeEnum.qa);

      expect(generateQA).not.toHaveBeenCalled();
      expect(generateVector).not.toHaveBeenCalled();
    });
  });

  describe('startTrainingQueue', () => {
    // Both queue starters are expected to be invoked the same number of times.
    const expectEachQueueStarted = (times: number) => {
      expect(generateQA).toHaveBeenCalledTimes(times);
      expect(generateVector).toHaveBeenCalledTimes(times);
    };

    beforeEach(() => {
      global.systemEnv = {
        qaMaxProcess: 3
      };
    });

    it('should start single process by default', () => {
      startTrainingQueue();

      expectEachQueueStarted(1);
    });

    it('should start max processes when fast mode enabled', () => {
      startTrainingQueue(true);

      expectEachQueueStarted(3);
    });

    it('should use default max process when not configured', () => {
      global.systemEnv = undefined;

      startTrainingQueue(true);

      expectEachQueueStarted(10);
    });
  });
});