External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
This commit is contained in:
Archer
2024-05-16 11:47:53 +08:00
committed by GitHub
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions

View File

@@ -151,12 +151,12 @@ export const readFileContentFromMongo = async ({
teamId,
bucketName,
fileId,
csvFormat = false
isQAImport = false
}: {
teamId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
csvFormat?: boolean;
isQAImport?: boolean;
}): Promise<{
rawText: string;
filename: string;
@@ -198,7 +198,7 @@ export const readFileContentFromMongo = async ({
const { rawText } = await readFileRawContent({
extension,
csvFormat,
isQAImport,
teamId,
buffer: fileBuffers,
encoding,

View File

@@ -5,6 +5,7 @@ import { addHours } from 'date-fns';
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import { ReadFileResponse } from '../../../worker/file/type';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
export const initMarkdownText = ({
teamId,
@@ -29,36 +30,44 @@ export const initMarkdownText = ({
/**
 * Reads the text of a file by handing its buffer to the `readFile` worker, then
 * post-processes the text according to `extension`:
 *  - 'md' / 'html' / 'docx': the text is passed through `initMarkdownText`.
 *  - 'csv' / 'xlsx': either the worker's `rawText` or its `formatText` is kept,
 *    depending on `isQAImport`.
 *
 * NOTE(review): this listing looks like a diff rendered without its +/- markers, so
 * several statements appear twice (a pre-change form using `result` / `csvFormat`
 * and a post-change form using `rawText` / `isQAImport`). Confirm which lines are
 * live against the repository before relying on it.
 *
 * @param extension - Selects the post-processing branch; also forwarded to the worker.
 * @param isQAImport - For csv/xlsx, keep `rawText` when truthy, otherwise use `formatText`.
 * @param teamId - Forwarded to `initMarkdownText`.
 * @param buffer - Buffer forwarded to the worker.
 * @param encoding - Forwarded to the worker.
 * @param metadata - Forwarded to `initMarkdownText`.
 * @returns An object holding the resulting `rawText`.
 */
export const readFileRawContent = async ({
extension,
csvFormat,
isQAImport,
teamId,
buffer,
encoding,
metadata
}: {
csvFormat?: boolean;
isQAImport?: boolean;
extension: string;
teamId: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
}) => {
// NOTE(review): the next two lines both open the same worker call; the one assigning
// `result` looks like the pre-change form — confirm.
const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
csvFormat,
encoding,
buffer
});
// markdown data format
// For these extensions the text is replaced by the output of initMarkdownText.
if (['md', 'html', 'docx'].includes(extension)) {
result.rawText = await initMarkdownText({
rawText = await initMarkdownText({
teamId: teamId,
md: result.rawText,
md: rawText,
metadata: metadata
});
}
// NOTE(review): probably the pre-change return; if it were live, the csv/xlsx
// branch below would be unreachable.
return result;
// csv / xlsx: choose which of the worker's two text fields to return.
if (['csv', 'xlsx'].includes(extension)) {
// qa data
if (isQAImport) {
// Keep rawText, falling back to an empty string when it is falsy.
rawText = rawText || '';
} else {
// Use formatText instead, falling back to an empty string when it is falsy.
rawText = formatText || '';
}
}
// Only rawText is returned; formatText is not part of the result.
return { rawText };
};
export const htmlToMarkdown = async (html?: string | null) => {

View File

@@ -77,9 +77,8 @@ export const urlsFetch = async ({
$,
selector
});
console.log('html====', html);
const md = await htmlToMarkdown(html);
console.log('html====', md);
return {
url,

View File

@@ -12,27 +12,34 @@ import { getNanoid } from '@fastgpt/global/common/string/tools';
import { addLog } from '../../system/log';
/**
 * Returns an entry from the worker pool stored on `global.tiktokenWorkers`.
 * While the pool holds fewer than `maxWorkers` entries, a new worker is created via
 * `getWorker(WorkerNameEnum.countGptMessagesTokens)` and added to the pool; once the
 * limit is reached, a randomly chosen existing entry is returned instead.
 * Each entry holds its `index`, the `worker`, and a `callbackMap` of pending
 * callbacks keyed by message id.
 *
 * NOTE(review): this listing looks like a diff rendered without its +/- markers. The
 * lines that reference the singular `global.tiktokenWorker` appear to be the
 * pre-change single-worker version and the braces do not balance as listed —
 * confirm which lines are live against the repository.
 */
export const getTiktokenWorker = () => {
// NOTE(review): singular `tiktokenWorker` check — presumably the pre-change form.
if (global.tiktokenWorker) {
return global.tiktokenWorker;
// Pool size limit: the configured tokenWorkers value, or 20 when it is missing or falsy.
const maxWorkers = global.systemEnv?.tokenWorkers || 20;
// Create the pool array on first use.
if (!global.tiktokenWorkers) {
global.tiktokenWorkers = [];
}
// Pool is full: hand back a randomly chosen existing entry.
if (global.tiktokenWorkers.length >= maxWorkers) {
return global.tiktokenWorkers[Math.floor(Math.random() * global.tiktokenWorkers.length)];
}
// Pool has room: start a new worker and register it.
const worker = getWorker(WorkerNameEnum.countGptMessagesTokens);
// push() returns the new array length, so the entry just added sits at index i - 1.
const i = global.tiktokenWorkers.push({
index: global.tiktokenWorkers.length,
worker,
callbackMap: {}
});
// Each worker message carries an id and its data; route the data to the callback
// registered under that id, then remove the callback from the map.
worker.on('message', ({ id, data }: { id: string; data: number }) => {
const callback = global.tiktokenWorker?.callbackMap?.[id];
const callback = global.tiktokenWorkers[i - 1]?.callbackMap?.[id];
if (callback) {
callback?.(data);
delete global.tiktokenWorker.callbackMap[id];
delete global.tiktokenWorkers[i - 1].callbackMap[id];
}
});
// NOTE(review): assignment to the singular `global.tiktokenWorker` and the return
// right after it — presumably the pre-change form.
global.tiktokenWorker = {
worker,
callbackMap: {}
};
return global.tiktokenWorker;
// Return the entry that was just added to the pool.
return global.tiktokenWorkers[i - 1];
};
export const countGptMessagesTokens = (
@@ -44,20 +51,29 @@ export const countGptMessagesTokens = (
const start = Date.now();
const { worker, callbackMap } = getTiktokenWorker();
const id = getNanoid();
const timer = setTimeout(() => {
resolve(0);
console.log('Count token Time out');
resolve(
messages.reduce((sum, item) => {
if (item.content) {
return sum + item.content.length * 0.5;
}
return sum;
}, 0)
);
delete callbackMap[id];
}, 300);
}, 60000);
callbackMap[id] = (data) => {
// 检测是否有内存泄漏
addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
// console.log(process.memoryUsage());
resolve(data);
clearTimeout(timer);
// 检测是否有内存泄漏
// addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
// console.log(process.memoryUsage());
};
worker.postMessage({