4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -0,0 +1,68 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadFileParams } from './type';
/**
 * Shape of one text item taken from `page.getTextContent().items` in `readPdfFile`.
 * Only `str`, `transform` and `hasEOL` are read by the code in this file.
 */
type TokenType = {
// Text of the item; an empty string is folded into the preceding item by `readPdfFile`.
str: string;
// NOTE(review): presumably the text direction reported by pdf.js — confirm against the pdf.js TextItem docs.
dir: string;
width: number;
height: number;
// Element [5] is compared against 5% / 95% of the page height to drop header and footer text.
transform: number[];
fontName: string;
// Together with a trailing-punctuation check, decides whether a newline is appended after `str`.
hasEOL: boolean;
};
/**
 * Extract the text of a PDF file with pdf.js.
 *
 * For every page, text items whose vertical offset (`transform[5]`) lies in the
 * top 5% or bottom 5% of the page are dropped (treated as header/footer), items
 * without a `transform` are kept, and empty-string items are merged into the
 * previous item by handing over their `hasEOL` flag. A newline is appended after
 * an item only when it ends a line AND ends with sentence-final punctuation.
 *
 * The pdf.js loading task is always destroyed and every page is always cleaned
 * up, including when reading a page fails.
 *
 * @param path - location of the PDF, passed straight to `pdfjs.getDocument`
 * @returns `{ rawText }` with the text of all pages concatenated in page order
 * @throws whatever pdf.js rejects with when the document or a page cannot be read
 */
export const readPdfFile = async ({ path }: ReadFileParams) => {
  // Sentence-final punctuation: CJK full stop, full-width ? (U+FF1F) and ! (U+FF01),
  // their ASCII counterparts, and line breaks. Hoisted so it is compiled once.
  const paragraphEndReg = /[。\uFF1F\uFF01.?!\n\r]$/;

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);

    try {
      const tokenizedText = await page.getTextContent();

      const viewport = page.getViewport({ scale: 1 });
      const pageHeight = viewport.height;
      const headerThreshold = pageHeight * 0.95;
      const footerThreshold = pageHeight * 0.05;

      const bodyTokens: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
        return (
          !token.transform ||
          (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
        );
      });

      // Fold empty-string items into the item kept before them (single linear pass).
      // A leading empty item has nothing to merge into, so it is kept as is.
      const pageTexts: TokenType[] = [];
      for (const item of bodyTokens) {
        const previous = pageTexts[pageTexts.length - 1];
        if (item.str === '' && previous) {
          previous.hasEOL = item.hasEOL;
        } else {
          pageTexts.push(item);
        }
      }

      return pageTexts
        .map((token) => {
          const paragraphEnd = token.hasEOL && paragraphEndReg.test(token.str);
          return paragraphEnd ? `${token.str}\n` : token.str;
        })
        .join('');
    } finally {
      // Release page resources even when text extraction failed.
      page.cleanup();
    }
  };

  const loadingTask = pdfjs.getDocument(path);

  try {
    const doc = await loadingTask.promise;

    const pageTextPromises: Promise<string>[] = [];
    for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
      pageTextPromises.push(readPDFPage(doc, pageNo));
    }
    const pageTexts = await Promise.all(pageTextPromises);

    return {
      rawText: pageTexts.join('')
    };
  } finally {
    // Always tear down the document/worker, and wait for it, so a failed page
    // read does not leak the loading task.
    await loadingTask.destroy();
  }
};

View File

@@ -0,0 +1,18 @@
/**
 * Input accepted by the file readers (`readFileContent`, `readPdfFile`, `readDocFle`).
 */
export type ReadFileParams = {
// NOTE(review): not read by any reader visible here — presumably marks a preview-only request; confirm against callers.
preview: boolean;
// Together with `path`, identifies a cached entry in `getReadFileBuffer`.
teamId: string;
// Location of the file to read; `readFileContent` picks the reader from its extension.
path: string;
// Optional free-form data; no reader visible here uses its contents.
metadata?: Record<string, any>;
};
/**
 * Result shape of a single file reader: the extracted text of the file.
 * NOTE(review): the visible readers return this shape but do not annotate it — confirm intended use.
 */
export type ReadFileResponse = {
rawText: string;
};
/**
 * One entry of the shared read buffer: the parameters a file was read with
 * plus the text that was extracted from it.
 */
export type ReadFileBufferItemType = ReadFileParams & {
rawText: string;
};
// Declares the `readFileBuffers` global that holds previously read files
// (filled by `pushFileReadBuffer`, searched by `getReadFileBuffer`).
declare global {
var readFileBuffers: ReadFileBufferItemType[];
}

View File

@@ -0,0 +1,50 @@
import { readPdfFile } from './pdf';
import { readDocFle } from './word';
import { ReadFileBufferItemType, ReadFileParams } from './type';
// Shared buffer of already-read files: reuse the array if one is already on
// `global`, otherwise start with an empty one.
if (!global.readFileBuffers) {
  global.readFileBuffers = [];
}

// Maximum number of entries kept in the buffer before the oldest is dropped.
const bufferMaxSize = 200;
/**
 * Append a read file to the shared buffer. Once the buffer holds more than
 * `bufferMaxSize` entries, the oldest entry is removed.
 *
 * @param params - the read parameters together with the extracted `rawText`
 */
export const pushFileReadBuffer = (params: ReadFileBufferItemType) => {
  const sizeAfterPush = global.readFileBuffers.push(params);

  if (sizeAfterPush > bufferMaxSize) {
    global.readFileBuffers.shift();
  }
};
/**
 * Look up a previously read file in the shared buffer.
 *
 * @returns the first buffered entry whose `path` and `teamId` both match,
 *          or `undefined` when there is none
 */
export const getReadFileBuffer = ({ path, teamId }: ReadFileParams) => {
  for (const entry of global.readFileBuffers) {
    if (entry.path === path && entry.teamId === teamId) {
      return entry;
    }
  }
  return undefined;
};
/**
 * Read a file's text, serving it from the shared buffer when the same
 * `path` + `teamId` was read before.
 *
 * The reader is chosen from the lower-cased text after the last `.` in `path`:
 * `pdf` uses `readPdfFile`, `docx` uses `readDocFle`. A freshly read file is
 * stored in the buffer before it is returned.
 *
 * @returns the buffered entry on a hit, otherwise `params` extended with `rawText`
 * @throws rejects with the string 'Only support .pdf, .docx' for any other extension,
 *         or with whatever the selected reader rejects with
 */
export const readFileContent = async (params: ReadFileParams) => {
  const { path: filePath } = params;

  const cached = getReadFileBuffer(params);
  if (cached) {
    return cached;
  }

  const extension = filePath?.split('.')?.pop()?.toLowerCase() || '';

  let parsed: { rawText: string };
  if (extension === 'pdf') {
    parsed = await readPdfFile(params);
  } else if (extension === 'docx') {
    parsed = await readDocFle(params);
  } else {
    return Promise.reject('Only support .pdf, .docx');
  }

  const { rawText } = parsed;
  const result = { ...params, rawText };

  // The buffer gets its own copy so the returned object is not shared with it.
  pushFileReadBuffer({ ...params, rawText });

  return result;
};

View File

@@ -0,0 +1,22 @@
import mammoth from 'mammoth';
import { htmlToMarkdown } from '../../string/markdown';
import { ReadFileParams } from './type';
/**
 * Read a .docx file and return its content as markdown.
 *
 * The document is converted to HTML with mammoth and the HTML is then turned
 * into markdown with `htmlToMarkdown`.
 *
 * @returns `{ rawText }` holding the markdown
 * @throws rejects with the string 'Can not read doc file, please convert to PDF'
 *         when either step fails; the underlying error is logged first
 */
export const readDocFle = async ({ path }: ReadFileParams) => {
  try {
    const converted = await mammoth.convertToHtml({ path });
    const rawText = await htmlToMarkdown(converted.value);

    return { rawText };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};