Compare commits
3 Commits
gru/projec
...
gru/projec
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1066ea62e3 | ||
|
|
8ed35ffe7e | ||
|
|
0f866fc552 |
@@ -645,7 +645,7 @@ data 为集合的 ID。
|
|||||||
{{< /tab >}}
|
{{< /tab >}}
|
||||||
{{< /tabs >}}
|
{{< /tabs >}}
|
||||||
|
|
||||||
### 创建一个外部文件库集合(商业版)
|
### 创建一个外部文件库集合(弃用)
|
||||||
|
|
||||||
{{< tabs tabTotal="3" >}}
|
{{< tabs tabTotal="3" >}}
|
||||||
{{< tab tabName="请求示例" >}}
|
{{< tab tabName="请求示例" >}}
|
||||||
|
|||||||
@@ -40,5 +40,6 @@ export function getSourceNameIcon({
|
|||||||
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
|
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
|
||||||
if (mode === TrainingModeEnum.qa) return data.length * 20;
|
if (mode === TrainingModeEnum.qa) return data.length * 20;
|
||||||
if (mode === TrainingModeEnum.auto) return data.length * 5;
|
if (mode === TrainingModeEnum.auto) return data.length * 5;
|
||||||
|
if (mode === TrainingModeEnum.image) return data.length * 2;
|
||||||
return data.length;
|
return data.length;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ export const readFileContentFromMongo = async ({
|
|||||||
rawText: string;
|
rawText: string;
|
||||||
filename: string;
|
filename: string;
|
||||||
}> => {
|
}> => {
|
||||||
const bufferId = `${fileId}-${customPdfParse}`;
|
const bufferId = `${String(fileId)}-${customPdfParse}`;
|
||||||
// read buffer
|
// read buffer
|
||||||
const fileBuffer = await getRawTextBuffer(bufferId);
|
const fileBuffer = await getRawTextBuffer(bufferId);
|
||||||
if (fileBuffer) {
|
if (fileBuffer) {
|
||||||
|
|||||||
@@ -1,5 +1,57 @@
|
|||||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||||
import { PassThrough } from 'stream';
|
import { PassThrough } from 'stream';
|
||||||
|
import { getGridBucket } from './controller';
|
||||||
|
import { type BucketNameEnum } from '@fastgpt/global/common/file/constants';
|
||||||
|
import { retryFn } from '@fastgpt/global/common/system/utils';
|
||||||
|
|
||||||
|
export const createFileFromText = async ({
|
||||||
|
bucket,
|
||||||
|
filename,
|
||||||
|
text,
|
||||||
|
metadata
|
||||||
|
}: {
|
||||||
|
bucket: `${BucketNameEnum}`;
|
||||||
|
filename: string;
|
||||||
|
text: string;
|
||||||
|
metadata: Record<string, any>;
|
||||||
|
}) => {
|
||||||
|
const gridBucket = getGridBucket(bucket);
|
||||||
|
|
||||||
|
const buffer = Buffer.from(text);
|
||||||
|
|
||||||
|
const fileSize = buffer.length;
|
||||||
|
// 单块大小:尽可能大,但不超过 14MB,不小于128KB
|
||||||
|
const chunkSizeBytes = (() => {
|
||||||
|
// 计算理想块大小:文件大小 ÷ 目标块数(10)。 并且每个块需要小于 14MB
|
||||||
|
const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024);
|
||||||
|
|
||||||
|
// 确保块大小至少为128KB
|
||||||
|
const minChunkSize = 128 * 1024; // 128KB
|
||||||
|
|
||||||
|
// 取理想块大小和最小块大小中的较大值
|
||||||
|
let chunkSize = Math.max(idealChunkSize, minChunkSize);
|
||||||
|
|
||||||
|
// 将块大小向上取整到最接近的64KB的倍数,使其更整齐
|
||||||
|
chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024);
|
||||||
|
|
||||||
|
return chunkSize;
|
||||||
|
})();
|
||||||
|
|
||||||
|
const uploadStream = gridBucket.openUploadStream(filename, {
|
||||||
|
metadata,
|
||||||
|
chunkSizeBytes
|
||||||
|
});
|
||||||
|
|
||||||
|
return retryFn(async () => {
|
||||||
|
return new Promise<{ fileId: string }>((resolve, reject) => {
|
||||||
|
uploadStream.end(buffer);
|
||||||
|
uploadStream.on('finish', () => {
|
||||||
|
resolve({ fileId: String(uploadStream.id) });
|
||||||
|
});
|
||||||
|
uploadStream.on('error', reject);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
|
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
|
||||||
return new Promise<Buffer>((resolve, reject) => {
|
return new Promise<Buffer>((resolve, reject) => {
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ const CustomTextInput = () => {
|
|||||||
createStatus: 'waiting',
|
createStatus: 'waiting',
|
||||||
rawText: data.value,
|
rawText: data.value,
|
||||||
sourceName: data.name,
|
sourceName: data.name,
|
||||||
icon: 'file/fill/manual'
|
icon: 'file/fill/txt'
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
goToNext();
|
goToNext();
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant
|
|||||||
import { NextAPI } from '@/service/middleware/entry';
|
import { NextAPI } from '@/service/middleware/entry';
|
||||||
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
|
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
|
||||||
import { type CreateCollectionResponse } from '@/global/core/dataset/api';
|
import { type CreateCollectionResponse } from '@/global/core/dataset/api';
|
||||||
|
import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils';
|
||||||
|
|
||||||
async function handler(req: NextApiRequest): CreateCollectionResponse {
|
async function handler(req: NextApiRequest): CreateCollectionResponse {
|
||||||
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
|
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
|
||||||
@@ -18,6 +19,18 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
|
|||||||
per: WritePermissionVal
|
per: WritePermissionVal
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// 1. Create file from text
|
||||||
|
const filename = `${name}.txt`;
|
||||||
|
const { fileId } = await createFileFromText({
|
||||||
|
bucket: 'dataset',
|
||||||
|
filename,
|
||||||
|
text,
|
||||||
|
metadata: {
|
||||||
|
teamId,
|
||||||
|
uid: tmbId
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const { collectionId, insertResults } = await createCollectionAndInsertData({
|
const { collectionId, insertResults } = await createCollectionAndInsertData({
|
||||||
dataset,
|
dataset,
|
||||||
rawText: text,
|
rawText: text,
|
||||||
@@ -25,9 +38,9 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
|
|||||||
...body,
|
...body,
|
||||||
teamId,
|
teamId,
|
||||||
tmbId,
|
tmbId,
|
||||||
type: DatasetCollectionTypeEnum.virtual,
|
type: DatasetCollectionTypeEnum.file,
|
||||||
|
fileId,
|
||||||
name
|
name: filename
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/sch
|
|||||||
import { pushQAUsage } from '@/service/support/wallet/usage/push';
|
import { pushQAUsage } from '@/service/support/wallet/usage/push';
|
||||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||||
import { createChatCompletion } from '@fastgpt/service/core/ai/config';
|
import { createChatCompletion } from '@fastgpt/service/core/ai/config';
|
||||||
import type { ChatCompletionMessageParam, StreamChatType } from '@fastgpt/global/core/ai/type.d';
|
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
|
||||||
import { addLog } from '@fastgpt/service/common/system/log';
|
import { addLog } from '@fastgpt/service/common/system/log';
|
||||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||||
import { replaceVariable } from '@fastgpt/global/common/string/tools';
|
import { replaceVariable } from '@fastgpt/global/common/string/tools';
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
import { TeamErrEnum } from '@fastgpt/global/common/error/code/team';
|
import { TeamErrEnum } from '@fastgpt/global/common/error/code/team';
|
||||||
import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit';
|
import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit';
|
||||||
import { sendOneInform } from '../support/user/inform/api';
|
import { sendOneInform } from '../../../support/user/inform/api';
|
||||||
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
|
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
|
||||||
import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants';
|
import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants';
|
||||||
|
|
||||||
@@ -18,7 +18,7 @@ export const checkTeamAiPointsAndLock = async (teamId: string) => {
|
|||||||
templateParam: {},
|
templateParam: {},
|
||||||
teamId
|
teamId
|
||||||
});
|
});
|
||||||
console.log('余额不足,暂停【向量】生成任务');
|
console.log('余额不足,暂停训练生成任务');
|
||||||
await lockTrainingDataByTeamId(teamId);
|
await lockTrainingDataByTeamId(teamId);
|
||||||
} catch (error) {}
|
} catch (error) {}
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
import { generateQA } from '@/service/events/generateQA';
|
import { generateQA } from '@/service/core/dataset/queues/generateQA';
|
||||||
import { generateVector } from '@/service/events/generateVector';
|
import { generateVector } from '@/service/core/dataset/queues/generateVector';
|
||||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||||
import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
|
import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
|
||||||
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
|
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { describe, it, expect } from 'vitest';
|
import { describe, it, expect } from 'vitest';
|
||||||
import { mdTextFormat, CodeClassNameEnum } from '@/components/Markdown/utils';
|
import { mdTextFormat, CodeClassNameEnum, filterSafeProps } from '@/components/Markdown/utils';
|
||||||
|
|
||||||
describe('Markdown utils', () => {
|
describe('Markdown utils', () => {
|
||||||
describe('mdTextFormat', () => {
|
describe('mdTextFormat', () => {
|
||||||
@@ -56,4 +56,121 @@ describe('Markdown utils', () => {
|
|||||||
expect(CodeClassNameEnum.audio).toBe('audio');
|
expect(CodeClassNameEnum.audio).toBe('audio');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('filterSafeProps', () => {
|
||||||
|
const allowedAttrs = new Set(['class', 'style', 'title', 'id']);
|
||||||
|
|
||||||
|
it('should filter out non-whitelisted attributes', () => {
|
||||||
|
const props = {
|
||||||
|
class: 'test',
|
||||||
|
nonexistent: 'value',
|
||||||
|
title: 'title'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'test',
|
||||||
|
title: 'title'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should filter out dangerous event handlers', () => {
|
||||||
|
const props = {
|
||||||
|
class: 'test',
|
||||||
|
onClick: () => {},
|
||||||
|
onMouseover: () => {}
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'test'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should filter out dangerous protocols', () => {
|
||||||
|
const props = {
|
||||||
|
title: 'javascript:alert(1)',
|
||||||
|
id: 'vbscript:alert(1)',
|
||||||
|
class: 'safe'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'safe'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle encoded malicious content', () => {
|
||||||
|
const props = {
|
||||||
|
title: 'javascript:alert(1)',
|
||||||
|
id: '%6A%61%76%61%73%63%72%69%70%74%3Aalert(1)',
|
||||||
|
class: 'safe'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'safe'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should filter style objects', () => {
|
||||||
|
const props = {
|
||||||
|
style: {
|
||||||
|
color: 'red',
|
||||||
|
background: 'javascript:alert(1)'
|
||||||
|
},
|
||||||
|
class: 'test'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'test'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle empty and null values', () => {
|
||||||
|
const props = {
|
||||||
|
class: '',
|
||||||
|
title: null,
|
||||||
|
style: null
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: '',
|
||||||
|
title: null,
|
||||||
|
style: null
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should filter nested objects except style', () => {
|
||||||
|
const props = {
|
||||||
|
data: { key: 'value' },
|
||||||
|
style: { color: 'red' },
|
||||||
|
class: 'test'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
style: { color: 'red' },
|
||||||
|
class: 'test'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle multiple iterations of encoded content', () => {
|
||||||
|
const props = {
|
||||||
|
title: encodeURIComponent(encodeURIComponent('javascript:alert(1)')),
|
||||||
|
class: 'safe'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'safe'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should filter suspicious content patterns', () => {
|
||||||
|
const props = {
|
||||||
|
title: 'Function("alert(1)")',
|
||||||
|
id: 'eval("alert(1)")',
|
||||||
|
class: 'test'
|
||||||
|
};
|
||||||
|
const result = filterSafeProps(props, allowedAttrs);
|
||||||
|
expect(result).toEqual({
|
||||||
|
class: 'test'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,128 +0,0 @@
|
|||||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
||||||
import {
|
|
||||||
createDatasetTrainingMongoWatch,
|
|
||||||
startTrainingQueue
|
|
||||||
} from '@/service/core/dataset/training/utils';
|
|
||||||
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
|
|
||||||
import { generateQA } from '@/service/core/dataset/queues/generateQA';
|
|
||||||
import { generateVector } from '@/service/core/dataset/queues/generateVector';
|
|
||||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
|
||||||
|
|
||||||
vi.mock('@/service/core/dataset/queues/generateQA', () => ({
|
|
||||||
generateQA: vi.fn()
|
|
||||||
}));
|
|
||||||
|
|
||||||
vi.mock('@/service/core/dataset/queues/generateVector', () => ({
|
|
||||||
generateVector: vi.fn()
|
|
||||||
}));
|
|
||||||
|
|
||||||
vi.mock('@fastgpt/service/core/dataset/training/schema', () => ({
|
|
||||||
MongoDatasetTraining: {
|
|
||||||
watch: vi.fn().mockReturnValue({
|
|
||||||
on: vi.fn()
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}));
|
|
||||||
|
|
||||||
describe('dataset training utils', () => {
|
|
||||||
beforeEach(() => {
|
|
||||||
vi.clearAllMocks();
|
|
||||||
});
|
|
||||||
|
|
||||||
describe('createDatasetTrainingMongoWatch', () => {
|
|
||||||
it('should setup mongo watch and handle qa mode', () => {
|
|
||||||
const mockOn = vi.fn();
|
|
||||||
vi.mocked(MongoDatasetTraining.watch).mockReturnValue({
|
|
||||||
on: mockOn
|
|
||||||
});
|
|
||||||
|
|
||||||
createDatasetTrainingMongoWatch();
|
|
||||||
|
|
||||||
expect(MongoDatasetTraining.watch).toHaveBeenCalled();
|
|
||||||
expect(mockOn).toHaveBeenCalledWith('change', expect.any(Function));
|
|
||||||
|
|
||||||
// Simulate change event for QA mode
|
|
||||||
const changeHandler = mockOn.mock.calls[0][1];
|
|
||||||
changeHandler({
|
|
||||||
operationType: 'insert',
|
|
||||||
fullDocument: {
|
|
||||||
mode: TrainingModeEnum.qa
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(generateQA).toHaveBeenCalled();
|
|
||||||
expect(generateVector).not.toHaveBeenCalled();
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should handle chunk mode', () => {
|
|
||||||
const mockOn = vi.fn();
|
|
||||||
vi.mocked(MongoDatasetTraining.watch).mockReturnValue({
|
|
||||||
on: mockOn
|
|
||||||
});
|
|
||||||
|
|
||||||
createDatasetTrainingMongoWatch();
|
|
||||||
|
|
||||||
const changeHandler = mockOn.mock.calls[0][1];
|
|
||||||
changeHandler({
|
|
||||||
operationType: 'insert',
|
|
||||||
fullDocument: {
|
|
||||||
mode: TrainingModeEnum.chunk
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(generateVector).toHaveBeenCalled();
|
|
||||||
expect(generateQA).not.toHaveBeenCalled();
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should ignore non-insert operations', () => {
|
|
||||||
const mockOn = vi.fn();
|
|
||||||
vi.mocked(MongoDatasetTraining.watch).mockReturnValue({
|
|
||||||
on: mockOn
|
|
||||||
});
|
|
||||||
|
|
||||||
createDatasetTrainingMongoWatch();
|
|
||||||
|
|
||||||
const changeHandler = mockOn.mock.calls[0][1];
|
|
||||||
changeHandler({
|
|
||||||
operationType: 'update',
|
|
||||||
fullDocument: {
|
|
||||||
mode: TrainingModeEnum.qa
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(generateQA).not.toHaveBeenCalled();
|
|
||||||
expect(generateVector).not.toHaveBeenCalled();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
describe('startTrainingQueue', () => {
|
|
||||||
beforeEach(() => {
|
|
||||||
global.systemEnv = {
|
|
||||||
qaMaxProcess: 3
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should start single process by default', () => {
|
|
||||||
startTrainingQueue();
|
|
||||||
|
|
||||||
expect(generateQA).toHaveBeenCalledTimes(1);
|
|
||||||
expect(generateVector).toHaveBeenCalledTimes(1);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should start max processes when fast mode enabled', () => {
|
|
||||||
startTrainingQueue(true);
|
|
||||||
|
|
||||||
expect(generateQA).toHaveBeenCalledTimes(3);
|
|
||||||
expect(generateVector).toHaveBeenCalledTimes(3);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should use default max process when not configured', () => {
|
|
||||||
global.systemEnv = undefined;
|
|
||||||
|
|
||||||
startTrainingQueue(true);
|
|
||||||
|
|
||||||
expect(generateQA).toHaveBeenCalledTimes(10);
|
|
||||||
expect(generateVector).toHaveBeenCalledTimes(10);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
||||||
Reference in New Issue
Block a user