perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar
This commit is contained in:
Archer
2025-01-05 14:40:02 +08:00
committed by archer
parent 5465ca642f
commit 2bf1fce32a
24 changed files with 345 additions and 100 deletions

View File

@@ -1,9 +1,27 @@
import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { getNanoid } from '@fastgpt/global/common/string/tools';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
/**
 * Replaces inline base64 images in an HTML string with short placeholder ids.
 *
 * Every `src` attribute whose value is a `data:<mime>;base64,<payload>` URI is
 * rewritten to `src="IMAGE_<id>_IMAGE"`, and the extracted mime type and
 * payload are collected so the caller can handle the image separately.
 * Both double-quoted and single-quoted attribute values are recognised.
 *
 * @param htmlContent - Raw HTML that may contain inline base64 images.
 * @returns `processedHtml` with the payloads swapped for placeholders, and
 *          `images` holding one entry (uuid, base64, mime) per replaced image,
 *          in document order.
 */
const processBase64Images = (htmlContent: string) => {
  // The opening quote is captured and must be matched by the same closing
  // quote (\1). Base64 payloads never contain quote characters, so excluding
  // both kinds from the payload class is safe.
  const base64Regex = /src=(["'])data:([^;]+);base64,([^"']+)\1/g;
  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(
    base64Regex,
    (_match: string, _quote: string, mime: string, base64Data: string) => {
      const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
      images.push({
        uuid,
        base64: base64Data,
        mime
      });
      // Always emit a double-quoted attribute so the output shape is uniform.
      return `src="${uuid}"`;
    }
  );

  return { processedHtml, images };
};
export const html2md = (
html: string
): {
@@ -25,11 +43,14 @@ export const html2md = (
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
const { text, imageList } = matchMdImgTextAndUpload(html);
// Swap inline base64 images for short placeholder ids first; otherwise their large payloads occupy memory during the HTML-to-markdown conversion
const { processedHtml, images } = processBase64Images(html);
const md = turndownService.turndown(processedHtml);
const { text, imageList } = matchMdImgTextAndUpload(md);
return {
rawText: turndownService.turndown(text),
imageList
rawText: text,
imageList: [...images, ...imageList]
};
} catch (error) {
console.log('html 2 markdown error', error);