4.7.1-alpha (#1120)
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
This commit is contained in:
@@ -6,16 +6,9 @@ import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoFileSchema } from './schema';
|
||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
import { readFileRawText } from '../read/rawText';
|
||||
import { ReadFileByBufferParams } from '../read/type';
|
||||
import { readMarkdown } from '../read/markdown';
|
||||
import { readHtmlRawText } from '../read/html';
|
||||
import { readPdfFile } from '../read/pdf';
|
||||
import { readWordFile } from '../read/word';
|
||||
import { readCsvRawText } from '../read/csv';
|
||||
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
|
||||
import { readPptxRawText } from '../read/pptx';
|
||||
import { readXlsxRawText } from '../read/xlsx';
|
||||
import { readFileRawContent } from '../read/utils';
|
||||
|
||||
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
|
||||
MongoFileSchema;
|
||||
@@ -146,7 +139,7 @@ export const readFileEncode = async ({
|
||||
return encoding as BufferEncoding;
|
||||
};
|
||||
|
||||
export const readFileContent = async ({
|
||||
export const readFileContentFromMongo = async ({
|
||||
teamId,
|
||||
bucketName,
|
||||
fileId,
|
||||
@@ -205,47 +198,14 @@ export const readFileContent = async ({
|
||||
}
|
||||
};
|
||||
|
||||
const { rawText } = await (async () => {
|
||||
switch (extension) {
|
||||
case 'txt':
|
||||
return readFileRawText(params);
|
||||
case 'md':
|
||||
return readMarkdown(params);
|
||||
case 'html':
|
||||
return readHtmlRawText(params);
|
||||
case 'pdf':
|
||||
return readPdfFile(params);
|
||||
case 'docx':
|
||||
return readWordFile(params);
|
||||
case 'pptx':
|
||||
return readPptxRawText(params);
|
||||
case 'xlsx':
|
||||
const xlsxResult = await readXlsxRawText(params);
|
||||
if (csvFormat) {
|
||||
return {
|
||||
rawText: xlsxResult.formatText || ''
|
||||
};
|
||||
}
|
||||
return {
|
||||
rawText: xlsxResult.rawText
|
||||
};
|
||||
case 'csv':
|
||||
const csvResult = await readCsvRawText(params);
|
||||
if (csvFormat) {
|
||||
return {
|
||||
rawText: csvResult.formatText || ''
|
||||
};
|
||||
}
|
||||
return {
|
||||
rawText: csvResult.rawText
|
||||
};
|
||||
default:
|
||||
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
|
||||
}
|
||||
})();
|
||||
const { rawText } = await readFileRawContent({
|
||||
extension,
|
||||
csvFormat,
|
||||
params
|
||||
});
|
||||
|
||||
if (rawText.trim()) {
|
||||
await MongoRwaTextBuffer.create({
|
||||
MongoRwaTextBuffer.create({
|
||||
sourceId: fileId,
|
||||
rawText,
|
||||
metadata: {
|
||||
|
||||
@@ -2,6 +2,15 @@ import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
|
||||
import { addHours } from 'date-fns';
|
||||
import { ReadFileByBufferParams } from './type';
|
||||
import { readFileRawText } from '../read/rawText';
|
||||
import { readMarkdown } from '../read/markdown';
|
||||
import { readHtmlRawText } from '../read/html';
|
||||
import { readPdfFile } from '../read/pdf';
|
||||
import { readWordFile } from '../read/word';
|
||||
import { readCsvRawText } from '../read/csv';
|
||||
import { readPptxRawText } from '../read/pptx';
|
||||
import { readXlsxRawText } from '../read/xlsx';
|
||||
|
||||
export const initMarkdownText = ({
|
||||
teamId,
|
||||
@@ -23,3 +32,50 @@ export const initMarkdownText = ({
|
||||
expiredTime: addHours(new Date(), 2)
|
||||
})
|
||||
});
|
||||
|
||||
/**
 * Extract the text content of a file by dispatching to the reader that
 * matches its extension.
 *
 * @param extension - File extension without a leading dot (e.g. `pdf`).
 *   Matched case-insensitively.
 * @param csvFormat - Only consulted for `xlsx` and `csv`: when truthy the
 *   reader's `formatText` is returned as `rawText` (empty string when the
 *   reader produced none); otherwise the reader's `rawText` is returned.
 * @param params - Passed through unchanged to the selected reader.
 * @returns For `txt`, `md`, `html`, `pdf`, `docx` and `pptx`, whatever the
 *   selected reader returns; for `xlsx` and `csv`, an object holding only
 *   `rawText`.
 * @throws Rejects with a plain string message (not an `Error`) when the
 *   extension is not one of the supported types.
 */
export const readFileRawContent = async ({
  extension,
  csvFormat,
  params
}: {
  csvFormat?: boolean;
  extension: string;
  params: ReadFileByBufferParams;
}) => {
  // Normalise case so that e.g. `PDF` is routed exactly like `pdf`.
  switch (extension.toLowerCase()) {
    case 'txt':
      return readFileRawText(params);
    case 'md':
      return readMarkdown(params);
    case 'html':
      return readHtmlRawText(params);
    case 'pdf':
      return readPdfFile(params);
    case 'docx':
      return readWordFile(params);
    case 'pptx':
      return readPptxRawText(params);
    // Each tabular branch gets its own block so its locals stay scoped to that case.
    case 'xlsx': {
      const { rawText, formatText } = await readXlsxRawText(params);
      return { rawText: csvFormat ? (formatText ?? '') : rawText };
    }
    case 'csv': {
      const { rawText, formatText } = await readCsvRawText(params);
      return { rawText: csvFormat ? (formatText ?? '') : rawText };
    }
    default:
      // The rejection value stays a plain string, byte-for-byte, because
      // callers may forward it verbatim.
      return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
  }
};
|
||||
|
||||
@@ -55,8 +55,8 @@ export const clearTmpUploadFiles = () => {
|
||||
fs.stat(filePath, (err, stats) => {
|
||||
if (err) return;
|
||||
|
||||
// 如果文件是在1小时前上传的,则认为是临时文件并删除它
|
||||
if (Date.now() - stats.mtime.getTime() > 1 * 60 * 60 * 1000) {
|
||||
// 如果文件是在2小时前上传的,则认为是临时文件并删除它
|
||||
if (Date.now() - stats.mtime.getTime() > 2 * 60 * 60 * 1000) {
|
||||
fs.unlink(filePath, (err) => {
|
||||
if (err) return;
|
||||
console.log(`Deleted temp file: ${filePath}`);
|
||||
|
||||
Reference in New Issue
Block a user