perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)
* perf: text encoding * perf: leave team code * perf: full text search code * fix: http status * perf: embedding search and vector avatar
This commit is contained in:
@@ -1,9 +1,27 @@
|
||||
import TurndownService from 'turndown';
|
||||
import { ImageType } from '../readFile/type';
|
||||
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
// @ts-ignore
|
||||
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
||||
|
||||
const processBase64Images = (htmlContent: string) => {
|
||||
const base64Regex = /src="data:([^;]+);base64,([^"]+)"/g;
|
||||
const images: ImageType[] = [];
|
||||
|
||||
const processedHtml = htmlContent.replace(base64Regex, (match, mime, base64Data) => {
|
||||
const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
|
||||
images.push({
|
||||
uuid,
|
||||
base64: base64Data,
|
||||
mime
|
||||
});
|
||||
return `src="${uuid}"`;
|
||||
});
|
||||
|
||||
return { processedHtml, images };
|
||||
};
|
||||
|
||||
export const html2md = (
|
||||
html: string
|
||||
): {
|
||||
@@ -25,11 +43,14 @@ export const html2md = (
|
||||
turndownService.remove(['i', 'script', 'iframe', 'style']);
|
||||
turndownService.use(turndownPluginGfm.gfm);
|
||||
|
||||
const { text, imageList } = matchMdImgTextAndUpload(html);
|
||||
// Base64 img to id, otherwise it will occupy memory when going to md
|
||||
const { processedHtml, images } = processBase64Images(html);
|
||||
const md = turndownService.turndown(processedHtml);
|
||||
const { text, imageList } = matchMdImgTextAndUpload(md);
|
||||
|
||||
return {
|
||||
rawText: turndownService.turndown(text),
|
||||
imageList
|
||||
rawText: text,
|
||||
imageList: [...images, ...imageList]
|
||||
};
|
||||
} catch (error) {
|
||||
console.log('html 2 markdown error', error);
|
||||
|
||||
Reference in New Issue
Block a user