4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -1,80 +1,5 @@
import mammoth from 'mammoth';
import Papa from 'papaparse';
import { compressBase64ImgAndUpload } from './controller';
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '@fastgpt/web/common/string/markdown';
import { readPdfFile } from '@fastgpt/global/common/file/read/index';
import { readFileRawText } from '@fastgpt/web/common/file/read';
/**
* read pdf to raw text
*/
export const readPdfContent = (file: File) =>
new Promise<string>((resolve, reject) => {
try {
let reader = new FileReader();
reader.readAsArrayBuffer(file);
reader.onload = async (event) => {
if (!event?.target?.result) return reject('解析 PDF 失败');
try {
const content = await readPdfFile({ pdf: event.target.result });
resolve(content);
} catch (err) {
console.log(err, 'pdf load error');
reject('解析 PDF 失败');
}
};
reader.onerror = (err) => {
console.log(err, 'pdf load error');
reject('解析 PDF 失败');
};
} catch (error) {
reject('浏览器不支持文件内容读取');
}
});
/**
* read docx to markdown
*/
export const readDocContent = (file: File, metadata: Record<string, any>) =>
new Promise<string>((resolve, reject) => {
try {
const reader = new FileReader();
reader.readAsArrayBuffer(file);
reader.onload = async ({ target }) => {
if (!target?.result) return reject('读取 doc 文件失败');
try {
const buffer = target.result as ArrayBuffer;
const { value: html } = await mammoth.convertToHtml({
arrayBuffer: buffer
});
const md = htmlStr2Md(html);
const rawText = await uploadMarkdownBase64(md, metadata);
resolve(rawText);
} catch (error) {
window.umami?.track('wordReadError', {
err: error?.toString()
});
console.log('error doc read:', error);
reject('读取 doc 文件失败, 请转换成 PDF');
}
};
reader.onerror = (err) => {
window.umami?.track('wordReadError', {
err: err?.toString()
});
console.log('error doc read:', err);
reject('读取 doc 文件失败');
};
} catch (error) {
reject('浏览器不支持文件内容读取');
}
});
import { readFileRawText } from '@fastgpt/web/common/file/read/rawText';
/**
* read csv to json
@@ -85,7 +10,7 @@ export const readDocContent = (file: File, metadata: Record<string, any>) =>
*/
export const readCsvContent = async (file: File) => {
try {
const textArr = await readFileRawText(file);
const { rawText: textArr } = await readFileRawText(file);
const csvArr = Papa.parse(textArr).data as string[][];
if (csvArr.length === 0) {
throw new Error('csv 解析失败');
@@ -99,44 +24,6 @@ export const readCsvContent = async (file: File) => {
}
};
/**
* format markdown
* 1. upload base64
* 2. replace \
*/
export const uploadMarkdownBase64 = async (rawText: string = '', metadata: Record<string, any>) => {
// match base64, upload and replace it
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
const base64Arr = rawText.match(base64Regex) || [];
// upload base64 and replace it
await Promise.all(
base64Arr.map(async (base64Img) => {
try {
const str = await compressBase64ImgAndUpload({
base64Img,
maxW: 4329,
maxH: 4329,
maxSize: 1024 * 1024 * 5,
metadata
});
rawText = rawText.replace(base64Img, str);
} catch (error) {
rawText = rawText.replace(base64Img, '');
rawText = rawText.replace(/!\[.*\]\(\)/g, '');
}
})
);
// Remove white space on both sides of the picture
const trimReg = /(!\[.*\]\(.*\))\s*/g;
if (trimReg.test(rawText)) {
rawText = rawText.replace(trimReg, '$1');
}
return simpleMarkdownText(rawText);
};
/**
* file download by text
*/