v4.6.2-alpha (#511)

This commit is contained in:
Archer
2023-11-24 15:29:43 +08:00
committed by GitHub
parent 60f752629f
commit 9cb4280a16
208 changed files with 5396 additions and 3500 deletions

View File

@@ -28,6 +28,16 @@ export const readTxtContent = (file: File) => {
*/
export const readPdfContent = (file: File) =>
new Promise<string>((resolve, reject) => {
type TokenType = {
str: string;
dir: string;
width: number;
height: number;
transform: number[];
fontName: string;
hasEOL: boolean;
};
try {
const pdfjsLib = window['pdfjs-dist/build/pdf'];
pdfjsLib.workerSrc = '/js/pdf.worker.js';
@@ -36,9 +46,19 @@ export const readPdfContent = (file: File) =>
const page = await doc.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const viewport = page.getViewport({ scale: 1 });
const pageHeight = viewport.height;
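const headerThreshold = pageHeight * 0.07; // Assume the page header lies within a band of 7% of the page height at one edge (NOTE(review): confirm which edge transform[5] is measured from)
const footerThreshold = pageHeight * 0.93; // Assume the page footer lies within the 7% band at the opposite edge; tokens outside (headerThreshold, footerThreshold) are dropped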
const pageText = tokenizedText.items
.map((token: any) => token.str)
.filter((item: string) => item)
.filter((token: TokenType) => {
return (
!token.transform ||
(token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
);
})
.map((token: TokenType) => token.str)
.join('');
return pageText;
};
@@ -54,7 +74,7 @@ export const readPdfContent = (file: File) =>
pageTextPromises.push(readPDFPage(doc, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
resolve(pageTexts.join('\n'));
resolve(pageTexts.join(''));
} catch (err) {
console.log(err, 'pdf load error');
reject('解析 PDF 失败');