4.6.7 first pr (#726)
This commit is contained in:
6
packages/service/common/file/constants.ts
Normal file
6
packages/service/common/file/constants.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
import path from 'path';
|
||||
|
||||
/**
 * Directory used for temporary files.
 *
 * Resolves to the fixed path `/app/tmp` when NODE_ENV is 'production',
 * otherwise to a `tmp` folder under the current working directory.
 */
export const tmpFileDirPath = (() => {
  if (process.env.NODE_ENV === 'production') {
    return '/app/tmp';
  }

  return path.join(process.cwd(), 'tmp');
})();
|
||||
|
||||
// NOTE(review): presumably the maximum number of characters returned when previewing a file — no consumer is visible here, confirm against callers.
export const previewMaxCharCount = 3000;
|
||||
@@ -1 +0,0 @@
|
||||
export const imageBaseUrl = '/api/system/img/';
|
||||
@@ -1,5 +1,5 @@
|
||||
import { UploadImgProps } from '@fastgpt/global/common/file/api';
|
||||
import { imageBaseUrl } from './constant';
|
||||
import { imageBaseUrl } from '@fastgpt/global/common/file/image/constants';
|
||||
import { MongoImage } from './schema';
|
||||
|
||||
export function getMongoImgUrl(id: string) {
|
||||
@@ -8,10 +8,13 @@ export function getMongoImgUrl(id: string) {
|
||||
|
||||
export const maxImgSize = 1024 * 1024 * 12;
|
||||
export async function uploadMongoImg({
|
||||
type,
|
||||
base64Img,
|
||||
teamId,
|
||||
expiredTime,
|
||||
metadata
|
||||
metadata,
|
||||
|
||||
shareId
|
||||
}: UploadImgProps & {
|
||||
teamId: string;
|
||||
}) {
|
||||
@@ -20,12 +23,16 @@ export async function uploadMongoImg({
|
||||
}
|
||||
|
||||
const base64Data = base64Img.split(',')[1];
|
||||
const binary = Buffer.from(base64Data, 'base64');
|
||||
|
||||
const { _id } = await MongoImage.create({
|
||||
type,
|
||||
teamId,
|
||||
binary: Buffer.from(base64Data, 'base64'),
|
||||
binary,
|
||||
expiredTime: expiredTime,
|
||||
metadata
|
||||
metadata,
|
||||
|
||||
shareId
|
||||
});
|
||||
|
||||
return getMongoImgUrl(String(_id));
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import { TeamCollectionName } from '@fastgpt/global/support/user/team/constant';
|
||||
import { connectionMongo, type Model } from '../../mongo';
|
||||
import { MongoImageSchemaType } from '@fastgpt/global/common/file/image/type.d';
|
||||
import { mongoImageTypeMap } from '@fastgpt/global/common/file/image/constants';
|
||||
const { Schema, model, models } = connectionMongo;
|
||||
|
||||
const ImageSchema = new Schema({
|
||||
@@ -12,12 +14,18 @@ const ImageSchema = new Schema({
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
},
|
||||
binary: {
|
||||
type: Buffer
|
||||
},
|
||||
expiredTime: {
|
||||
type: Date
|
||||
},
|
||||
binary: {
|
||||
type: Buffer
|
||||
},
|
||||
type: {
|
||||
type: String,
|
||||
enum: Object.keys(mongoImageTypeMap),
|
||||
required: true
|
||||
},
|
||||
|
||||
metadata: {
|
||||
type: Object
|
||||
}
|
||||
@@ -25,14 +33,13 @@ const ImageSchema = new Schema({
|
||||
|
||||
try {
|
||||
ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
|
||||
ImageSchema.index({ type: 1 });
|
||||
ImageSchema.index({ teamId: 1 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
export const MongoImage: Model<{
|
||||
teamId: string;
|
||||
binary: Buffer;
|
||||
metadata?: { fileId?: string };
|
||||
}> = models['image'] || model('image', ImageSchema);
|
||||
export const MongoImage: Model<MongoImageSchemaType> =
|
||||
models['image'] || model('image', ImageSchema);
|
||||
|
||||
MongoImage.syncIndexes();
|
||||
|
||||
68
packages/service/common/file/load/pdf.ts
Normal file
68
packages/service/common/file/load/pdf.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
|
||||
import { ReadFileParams } from './type';
|
||||
|
||||
/**
 * Shape of one text item taken from a PDF page's text content.
 *
 * The reader in this file only looks at `str`, `hasEOL` and `transform[5]`;
 * the remaining fields are declared but not read here.
 */
type TokenType = {
  /** Text carried by the item; may be an empty string. */
  str: string;
  /** End-of-line flag; a newline is only appended after items that have it set. */
  hasEOL: boolean;
  /** Index 5 is compared against the page height to drop header/footer text. */
  transform: Array<number>;
  width: number;
  height: number;
  dir: string;
  fontName: string;
};
|
||||
|
||||
/**
 * Extract the plain text of a PDF file.
 *
 * All pages are read concurrently. On each page, items whose vertical
 * position (`transform[5]`) lies in the top or bottom 5% of the page height
 * are dropped as header/footer text, and the remaining items are joined.
 *
 * @param params - only `path` is used; it is handed to `pdfjs.getDocument`.
 * @returns an object whose `rawText` is the concatenated text of every page.
 * @throws rejects with whatever pdfjs raises while loading or reading the
 *   document; the loading task is destroyed in every case.
 */
export const readPdfFile = async ({ path }: ReadFileParams) => {
  // A token ending with sentence-final punctuation or a line break closes a paragraph.
  // The full-width `？！` sit next to `。` so CJK sentence endings are recognised
  // (the class previously listed the ASCII `?!` pair twice).
  const paragraphEndReg = /[。？！.?!\n\r]$/;

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // Keep items without a transform, and items positioned between the two thresholds.
    const bodyTokens: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // Fold every empty-string item into the item kept before it: the empty item
    // is dropped and its `hasEOL` flag replaces the previous item's flag.
    // An empty item with nothing before it is kept as is.
    const pageTexts: TokenType[] = [];
    for (const item of bodyTokens) {
      const previous = pageTexts[pageTexts.length - 1];
      if (item.str === '' && previous) {
        previous.hasEOL = item.hasEOL;
      } else {
        pageTexts.push(item);
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        const paragraphEnd = token.hasEOL && paragraphEndReg.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const loadingTask = pdfjs.getDocument(path);

  try {
    const doc = await loadingTask.promise;

    // Page numbers are 1-based.
    const pageTexts = await Promise.all(
      Array.from({ length: doc.numPages }, (_, index) => readPDFPage(doc, index + 1))
    );

    return {
      rawText: pageTexts.join('')
    };
  } finally {
    // Release the document even when loading or a page read fails.
    await loadingTask.destroy();
  }
};
|
||||
18
packages/service/common/file/load/type.d.ts
vendored
Normal file
18
packages/service/common/file/load/type.d.ts
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
/** Arguments passed to the file readers. */
export type ReadFileParams = {
  /** Location of the file to read. */
  path: string;
  teamId: string;
  // NOTE(review): not read by any reader visible alongside this type — confirm its purpose with callers.
  preview: boolean;
  /** Optional free-form extra data. */
  metadata?: { [key: string]: any };
};
|
||||
|
||||
/** Result of reading a file: its extracted text. */
export type ReadFileResponse = { rawText: string };
|
||||
|
||||
/** A read request together with the text that was extracted for it. */
export type ReadFileBufferItemType = ReadFileParams & ReadFileResponse;
|
||||
|
||||
declare global {
  // Global list of read requests and their extracted text.
  // Must stay a `var` declaration so it is exposed as a property of the global object.
  var readFileBuffers: Array<ReadFileBufferItemType>;
}
|
||||
50
packages/service/common/file/load/utils.ts
Normal file
50
packages/service/common/file/load/utils.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import { readPdfFile } from './pdf';
|
||||
import { readDocFle } from './word';
|
||||
import { ReadFileBufferItemType, ReadFileParams } from './type';
|
||||
|
||||
// Upper bound on the number of entries kept in `global.readFileBuffers`.
const bufferMaxSize = 200;

// Create the shared list once; an existing list is left untouched.
if (!global.readFileBuffers) {
  global.readFileBuffers = [];
}
|
||||
|
||||
/**
 * Append an entry to the global read buffer.
 *
 * When the list grows beyond `bufferMaxSize`, the oldest entry (the head of
 * the list) is removed.
 */
export const pushFileReadBuffer = (params: ReadFileBufferItemType) => {
  const cache = global.readFileBuffers;

  // `push` returns the new length of the list.
  const size = cache.push(params);

  if (size > bufferMaxSize) {
    cache.shift();
  }
};
|
||||
/**
 * Look up a buffered read result.
 *
 * @returns the first entry whose `path` and `teamId` both match, or
 *   `undefined` when there is none.
 */
export const getReadFileBuffer = ({ path, teamId }: ReadFileParams) => {
  for (const entry of global.readFileBuffers) {
    if (entry.path === path && entry.teamId === teamId) {
      return entry;
    }
  }

  return undefined;
};
|
||||
|
||||
/**
 * Read the text content of a file, using the global read buffer as a cache.
 *
 * The reader is chosen from the lower-cased text after the last `.` in
 * `path`: `pdf` uses `readPdfFile`, `docx` uses `readDocFle`. A freshly read
 * result is pushed to the buffer before it is returned.
 *
 * @param params - read request; `path` and `teamId` identify the cache entry.
 * @returns the given params plus the extracted `rawText`.
 * @throws rejects with the string 'Only support .pdf, .docx' for any other
 *   extension, or with whatever the selected reader rejects with.
 */
export const readFileContent = async (params: ReadFileParams) => {
  const { path } = params;

  const buffer = getReadFileBuffer(params);

  if (buffer) {
    // Return a fresh object built from the current request rather than the
    // cached entry itself: the result then reflects this caller's params and
    // callers cannot mutate the shared cache entry.
    return {
      ...params,
      rawText: buffer.rawText
    };
  }

  const extension = path?.split('.')?.pop()?.toLowerCase() || '';

  const { rawText } = await (async () => {
    switch (extension) {
      case 'pdf':
        return readPdfFile(params);
      case 'docx':
        return readDocFle(params);
      default:
        return Promise.reject('Only support .pdf, .docx');
    }
  })();

  pushFileReadBuffer({
    ...params,
    rawText
  });

  return {
    ...params,
    rawText
  };
};
|
||||
22
packages/service/common/file/load/word.ts
Normal file
22
packages/service/common/file/load/word.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
import mammoth from 'mammoth';
|
||||
import { htmlToMarkdown } from '../../string/markdown';
|
||||
import { ReadFileParams } from './type';
|
||||
/**
 * Read a docx file and convert it to markdown.
 *
 * The file is first converted to HTML with mammoth, then the HTML is turned
 * into markdown by `htmlToMarkdown`.
 *
 * @param params - only `path` is used.
 * @returns an object whose `rawText` is the markdown text.
 * @throws rejects with the string 'Can not read doc file, please convert to PDF'
 *   when either conversion step fails; the underlying error is logged.
 */
export const readDocFle = async ({ path }: ReadFileParams) => {
  try {
    const { value: html } = await mammoth.convertToHtml({
      path
    });

    const md = await htmlToMarkdown(html);

    return {
      rawText: md
    };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
|
||||
@@ -1,11 +1,9 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { customAlphabet } from 'nanoid';
|
||||
import multer from 'multer';
|
||||
import path from 'path';
|
||||
import { BucketNameEnum, bucketNameMap } from '@fastgpt/global/common/file/constants';
|
||||
import fs from 'fs';
|
||||
|
||||
const nanoid = customAlphabet('1234567890abcdef', 12);
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
import { tmpFileDirPath } from './constants';
|
||||
|
||||
type FileType = {
|
||||
fieldname: string;
|
||||
@@ -17,7 +15,9 @@ type FileType = {
|
||||
size: number;
|
||||
};
|
||||
|
||||
export function getUploadModel({ maxSize = 500 }: { maxSize?: number }) {
|
||||
const expiredTime = 30 * 60 * 1000;
|
||||
|
||||
export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
|
||||
maxSize *= 1024 * 1024;
|
||||
class UploadModel {
|
||||
uploader = multer({
|
||||
@@ -26,9 +26,12 @@ export function getUploadModel({ maxSize = 500 }: { maxSize?: number }) {
|
||||
},
|
||||
preservePath: true,
|
||||
storage: multer.diskStorage({
|
||||
filename: (_req, file, cb) => {
|
||||
// destination: (_req, _file, cb) => {
|
||||
// cb(null, tmpFileDirPath);
|
||||
// },
|
||||
filename: async (req, file, cb) => {
|
||||
const { ext } = path.parse(decodeURIComponent(file.originalname));
|
||||
cb(null, nanoid() + ext);
|
||||
cb(null, `${Date.now() + expiredTime}-${getNanoid(32)}${ext}`);
|
||||
}
|
||||
})
|
||||
}).any();
|
||||
@@ -75,14 +78,4 @@ export function getUploadModel({ maxSize = 500 }: { maxSize?: number }) {
|
||||
}
|
||||
|
||||
return new UploadModel();
|
||||
}
|
||||
|
||||
export const removeFilesByPaths = (paths: string[]) => {
|
||||
paths.forEach((path) => {
|
||||
fs.unlink(path, (err) => {
|
||||
if (err) {
|
||||
console.error(err);
|
||||
}
|
||||
});
|
||||
});
|
||||
};
|
||||
33
packages/service/common/file/utils.ts
Normal file
33
packages/service/common/file/utils.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import fs from 'fs';
|
||||
import { tmpFileDirPath } from './constants';
|
||||
|
||||
/**
 * Delete the given files asynchronously.
 *
 * Each path is unlinked independently; a failure is written to the error
 * console and does not affect the other deletions. The function returns
 * without waiting for the deletions to finish.
 */
export const removeFilesByPaths = (paths: string[]) => {
  const reportUnlinkError = (unlinkError: unknown) => {
    if (unlinkError) {
      console.error(unlinkError);
    }
  };

  paths.forEach((filePath) => fs.unlink(filePath, reportUnlinkError));
};
|
||||
|
||||
/**
 * Cron job: remove expired files from the temporary directory.
 *
 * The part of each file name before the first `-` is parsed as a number and
 * treated as the expiry timestamp; a file is removed when that timestamp is
 * earlier than the current time. An empty prefix counts as 0 (expired), and
 * a non-numeric prefix yields NaN, which never compares as expired.
 *
 * Does nothing when the temporary directory does not exist.
 */
export const checkExpiredTmpFiles = () => {
  // The directory may not exist yet; `readdirSync` would throw in that case.
  if (!fs.existsSync(tmpFileDirPath)) {
    return;
  }

  const now = Date.now();

  // Collect the paths of all files whose expiry prefix lies in the past.
  const expiredPaths = fs
    .readdirSync(tmpFileDirPath)
    .filter((name) => {
      const timestampStr = name.split('-')[0];
      const expiredTimestamp = timestampStr ? Number(timestampStr) : 0;

      return expiredTimestamp < now;
    })
    .map((name) => `${tmpFileDirPath}/${name}`);

  removeFilesByPaths(expiredPaths);
};
|
||||
@@ -50,8 +50,11 @@ export const cheerioToHtml = ({
|
||||
.get()
|
||||
.join('\n');
|
||||
|
||||
const title = $('head title').text() || $('h1:first').text() || fetchUrl;
|
||||
|
||||
return {
|
||||
html,
|
||||
title,
|
||||
usedSelector
|
||||
};
|
||||
};
|
||||
@@ -70,7 +73,7 @@ export const urlsFetch = async ({
|
||||
});
|
||||
|
||||
const $ = cheerio.load(fetchRes.data);
|
||||
const { html, usedSelector } = cheerioToHtml({
|
||||
const { title, html, usedSelector } = cheerioToHtml({
|
||||
fetchUrl: url,
|
||||
$,
|
||||
selector
|
||||
@@ -79,6 +82,7 @@ export const urlsFetch = async ({
|
||||
|
||||
return {
|
||||
url,
|
||||
title,
|
||||
content: md,
|
||||
selector: usedSelector
|
||||
};
|
||||
@@ -87,6 +91,7 @@ export const urlsFetch = async ({
|
||||
|
||||
return {
|
||||
url,
|
||||
title: '',
|
||||
content: '',
|
||||
selector: ''
|
||||
};
|
||||
|
||||
@@ -15,7 +15,9 @@ export const htmlToMarkdown = (html?: string | null) =>
|
||||
worker.on('message', (md: string) => {
|
||||
worker.terminate();
|
||||
|
||||
resolve(simpleMarkdownText(md));
|
||||
let rawText = simpleMarkdownText(md);
|
||||
|
||||
resolve(rawText);
|
||||
});
|
||||
worker.on('error', (err) => {
|
||||
worker.terminate();
|
||||
|
||||
6
packages/service/common/system/cron.ts
Normal file
6
packages/service/common/system/cron.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
import nodeCron from 'node-cron';
|
||||
|
||||
/**
 * Register a callback with node-cron.
 *
 * @param time - cron expression; field order: second minute hour day month week
 * @param cb - function passed to `nodeCron.schedule` as the job to run
 * @returns the value returned by `nodeCron.schedule`
 */
export const setCron = (time: string, cb: () => void) => nodeCron.schedule(time, cb);
|
||||
@@ -49,6 +49,7 @@ export const addLog = {
|
||||
},
|
||||
error(msg: string, error?: any) {
|
||||
this.log('error', msg, {
|
||||
message: error?.message,
|
||||
stack: error?.stack,
|
||||
...(error?.config && {
|
||||
config: {
|
||||
|
||||
@@ -2,6 +2,8 @@ export type DeleteDatasetVectorProps = {
|
||||
id?: string;
|
||||
datasetIds?: string[];
|
||||
collectionIds?: string[];
|
||||
|
||||
collectionId?: string;
|
||||
dataIds?: string[];
|
||||
};
|
||||
|
||||
|
||||
@@ -101,14 +101,19 @@ export const deleteDatasetDataVector = async (
|
||||
retry?: number;
|
||||
}
|
||||
): Promise<any> => {
|
||||
const { id, datasetIds, collectionIds, dataIds, retry = 2 } = props;
|
||||
const { id, datasetIds, collectionIds, collectionId, dataIds, retry = 2 } = props;
|
||||
|
||||
const where = await (() => {
|
||||
if (id) return `id=${id}`;
|
||||
if (datasetIds) return `dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')})`;
|
||||
if (collectionIds)
|
||||
if (collectionIds) {
|
||||
return `collection_id IN (${collectionIds.map((id) => `'${String(id)}'`).join(',')})`;
|
||||
if (dataIds) return `data_id IN (${dataIds.map((id) => `'${String(id)}'`).join(',')})`;
|
||||
}
|
||||
if (collectionId && dataIds) {
|
||||
return `collection_id='${String(collectionId)}' and data_id IN (${dataIds
|
||||
.map((id) => `'${String(id)}'`)
|
||||
.join(',')})`;
|
||||
}
|
||||
return Promise.reject('deleteDatasetData: no where');
|
||||
})();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user