4.6.7 first pr (#726)
This commit is contained in:
68
packages/service/common/file/load/pdf.ts
Normal file
68
packages/service/common/file/load/pdf.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
|
||||
import { ReadFileParams } from './type';
|
||||
|
||||
/**
 * Shape of one text item taken from `page.getTextContent().items`.
 *
 * The PDF reader in this file only reads `str`, `transform` and `hasEOL`;
 * the remaining fields are declared but not used here
 * (NOTE(review): presumably they mirror the pdf.js text item — confirm).
 */
type TokenType = {
  /** Text carried by the item; can be the empty string. */
  str: string;
  /** Index 5 is compared with the page height to drop header/footer items. */
  transform: number[];
  /** Combined with trailing punctuation to decide whether to emit a newline. */
  hasEOL: boolean;

  dir: string;
  width: number;
  height: number;
  fontName: string;
};
|
||||
|
||||
/**
 * Extract the text of a PDF file.
 *
 * Every page is read through pdf.js. Text items whose `transform[5]` lies in
 * the outer 5% of the page height at either end are dropped (treated as
 * header/footer), empty items are folded into the preceding item, and a
 * newline is appended after an item that ends a line with sentence-ending
 * punctuation.
 *
 * @param params - Only `path` is used; it is handed to `pdfjs.getDocument`.
 * @returns An object whose `rawText` is the text of all pages concatenated.
 * @throws Rejects with whatever pdf.js raises when the document or one of its
 *   pages cannot be read. The loading task is destroyed in that case too.
 */
export const readPdfFile = async ({ path }: ReadFileParams) => {
  // Built once instead of once per text item.
  const paragraphEndPattern = /([。?!.?!\n\r]|(\r\n))$/;

  /** Read one page (1-based `pageNo`) and return its text. */
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // Keep items without a transform, and items strictly between the two thresholds.
    const bodyTokens: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // Fold each empty-string item into the item before it, carrying over its
    // `hasEOL` flag. One forward pass replaces the former splice-in-loop, and
    // items are copied so the objects owned by pdf.js are not mutated.
    const pageTexts: TokenType[] = [];
    for (const token of bodyTokens) {
      const previous = pageTexts[pageTexts.length - 1];
      if (token.str === '' && previous) {
        previous.hasEOL = token.hasEOL;
      } else {
        pageTexts.push({ ...token });
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        const paragraphEnd = token.hasEOL && paragraphEndPattern.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const loadingTask = pdfjs.getDocument(path);

  try {
    const doc = await loadingTask.promise;

    // Pages are requested together; page numbers start at 1.
    const pageTexts = await Promise.all(
      Array.from({ length: doc.numPages }, (_, index) => readPDFPage(doc, index + 1))
    );

    return {
      rawText: pageTexts.join('')
    };
  } finally {
    // Always release the loading task, including when loading or a page read
    // failed. A cleanup failure is logged rather than thrown so it cannot
    // mask the original error or turn a successful read into a rejection.
    try {
      await loadingTask.destroy();
    } catch (error) {
      console.log('error pdf destroy:', error);
    }
  }
};
|
||||
18
packages/service/common/file/load/type.d.ts
vendored
Normal file
18
packages/service/common/file/load/type.d.ts
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
/** Arguments accepted by the file readers in this directory. */
export type ReadFileParams = {
  /** Location of the file to read. */
  path: string;
  /** Team identifier supplied by the caller. */
  teamId: string;
  /** NOTE(review): not read by any code visible here — confirm intended use. */
  preview: boolean;
  /** Optional free-form data supplied by the caller. */
  metadata?: Record<string, any>;
};
|
||||
|
||||
/** Result shape of a file reader: the extracted text. */
export type ReadFileResponse = {
  /** Text content extracted from the file. */
  rawText: string;
};
|
||||
|
||||
/** A buffered read result: the read parameters plus the extracted text. */
export type ReadFileBufferItemType = ReadFileParams & {
  /** Text content extracted from the file. */
  rawText: string;
};
|
||||
|
||||
declare global {
  /** Global list of buffered file-read results. */
  var readFileBuffers: ReadFileBufferItemType[];
}
|
||||
50
packages/service/common/file/load/utils.ts
Normal file
50
packages/service/common/file/load/utils.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import { readPdfFile } from './pdf';
|
||||
import { readDocFle } from './word';
|
||||
import { ReadFileBufferItemType, ReadFileParams } from './type';
|
||||
|
||||
// Create the global read buffer on first load; an existing one is kept.
if (!global.readFileBuffers) {
  global.readFileBuffers = [];
}

// Upper bound on the number of buffered read results.
const bufferMaxSize = 200;
|
||||
|
||||
/**
 * Append a read result to the global buffer.
 *
 * When the buffer grows past `bufferMaxSize`, the oldest entry (the first
 * element) is removed.
 *
 * @param params - Read parameters together with the extracted text.
 */
export const pushFileReadBuffer = (params: ReadFileBufferItemType) => {
  const buffers = global.readFileBuffers;

  buffers.push(params);

  const overflowed = buffers.length > bufferMaxSize;
  if (overflowed) {
    buffers.shift();
  }
};
|
||||
/**
 * Look up a buffered read result.
 *
 * @returns The first buffered entry whose `path` and `teamId` both match,
 *   or `undefined` when there is none.
 */
export const getReadFileBuffer = ({ path, teamId }: ReadFileParams) => {
  for (const cached of global.readFileBuffers) {
    if (cached.path === path && cached.teamId === teamId) {
      return cached;
    }
  }
  return undefined;
};
|
||||
|
||||
/**
 * Read a file's text, reusing a buffered result when one exists.
 *
 * The reader is chosen from the lower-cased text after the last `.` in
 * `params.path`: `pdf` and `docx` are supported. A fresh result is pushed
 * to the global buffer before it is returned.
 *
 * @param params - Read parameters; `path` and `teamId` identify the buffer entry.
 * @returns The buffered entry, or the parameters extended with `rawText`.
 * @throws Rejects with the string 'Only support .pdf, .docx' for any other
 *   extension, and with whatever the selected reader rejects with.
 */
export const readFileContent = async (params: ReadFileParams) => {
  const cached = getReadFileBuffer(params);
  if (cached) {
    return cached;
  }

  const extension = params.path?.split('.')?.pop()?.toLowerCase() || '';

  const isPdf = extension === 'pdf';
  const isDocx = extension === 'docx';
  if (!isPdf && !isDocx) {
    return Promise.reject('Only support .pdf, .docx');
  }

  const { rawText } = isPdf ? await readPdfFile(params) : await readDocFle(params);

  // The buffered entry and the returned value are separate objects.
  pushFileReadBuffer({ ...params, rawText });

  return { ...params, rawText };
};
|
||||
22
packages/service/common/file/load/word.ts
Normal file
22
packages/service/common/file/load/word.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
import mammoth from 'mammoth';
|
||||
import { htmlToMarkdown } from '../../string/markdown';
|
||||
import { ReadFileParams } from './type';
|
||||
/**
|
||||
* read docx to markdown
|
||||
*/
|
||||
/**
 * Read a .docx file and return its content as markdown.
 *
 * The document is converted to HTML with mammoth, then to markdown with
 * `htmlToMarkdown`. (The exported name `readDocFle` is kept as-is because
 * other modules import it under that spelling.)
 *
 * @param params - Only `path` is used; it is handed to `mammoth.convertToHtml`.
 * @returns An object whose `rawText` is the markdown text.
 * @throws Rejects with the string 'Can not read doc file, please convert to PDF'
 *   when either conversion step fails; the underlying error is logged first.
 */
export const readDocFle = async ({ path }: ReadFileParams) => {
  try {
    const { value: html } = await mammoth.convertToHtml({ path });

    const md = await htmlToMarkdown(html);

    return {
      rawText: md
    };
  } catch (error) {
    // Log the real cause; callers only receive the user-facing message below.
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
|
||||
Reference in New Issue
Block a user