Compare commits
15 Commits
v4.8.23-fi
...
v4.8.23-fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6d4776b3aa | ||
|
|
2d351c3654 | ||
|
|
662a4a4671 | ||
|
|
3fadabd28b | ||
|
|
dbf25cef88 | ||
|
|
b2e2fa6b76 | ||
|
|
576c60bd55 | ||
|
|
33617ab5dc | ||
|
|
b4dda6a41b | ||
|
|
e860c56b77 | ||
|
|
efac5312b4 | ||
|
|
4bc7f21182 | ||
|
|
113e8f711f | ||
|
|
abc6dffb41 | ||
|
|
f7b2a57ca3 |
2
.github/workflows/docs-deploy-kubeconfig.yml
vendored
2
.github/workflows/docs-deploy-kubeconfig.yml
vendored
@@ -6,8 +6,6 @@ on:
|
||||
- 'docSite/**'
|
||||
branches:
|
||||
- 'main'
|
||||
tags:
|
||||
- 'v*.*.*'
|
||||
|
||||
jobs:
|
||||
build-fastgpt-docs-images:
|
||||
|
||||
2
.github/workflows/docs-deploy-vercel.yml
vendored
2
.github/workflows/docs-deploy-vercel.yml
vendored
@@ -7,8 +7,6 @@ on:
|
||||
- 'docSite/**'
|
||||
branches:
|
||||
- 'main'
|
||||
tags:
|
||||
- 'v*.*.*'
|
||||
|
||||
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
|
||||
jobs:
|
||||
|
||||
2
.github/workflows/docs-preview.yml
vendored
2
.github/workflows/docs-preview.yml
vendored
@@ -4,8 +4,6 @@ on:
|
||||
pull_request_target:
|
||||
paths:
|
||||
- 'docSite/**'
|
||||
branches:
|
||||
- 'main'
|
||||
workflow_dispatch:
|
||||
|
||||
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
|
||||
|
||||
6
.github/workflows/fastgpt-build-image.yml
vendored
6
.github/workflows/fastgpt-build-image.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
with:
|
||||
driver-opts: network=host
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
with:
|
||||
driver-opts: network=host
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
@@ -191,7 +191,7 @@ jobs:
|
||||
with:
|
||||
driver-opts: network=host
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
|
||||
2
.github/workflows/sandbox-build-image.yml
vendored
2
.github/workflows/sandbox-build-image.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
with:
|
||||
driver-opts: network=host
|
||||
- name: Cache Docker layers
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /tmp/.buildx-cache
|
||||
key: ${{ runner.os }}-buildx-${{ github.sha }}
|
||||
|
||||
26
SECURITY.md
Normal file
26
SECURITY.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# 安全策略
|
||||
|
||||
## 漏洞报告
|
||||
|
||||
如果您发现了 FastGPT 的安全漏洞,请按照以下步骤进行报告:
|
||||
|
||||
1. **报告方式**
|
||||
发送邮件至:yujinlong@sealos.io
|
||||
请备注版本以及您的 GitHub 账号
|
||||
|
||||
2. **响应时间**
|
||||
- 我们会在 48 小时内确认收到您的报告
|
||||
- 一般在 3 个工作日内给出初步评估结果
|
||||
|
||||
3. **漏洞处理流程**
|
||||
- 确认漏洞:我们会验证漏洞的存在性和影响范围
|
||||
- 修复开发:针对已确认的漏洞进行修复
|
||||
- 版本发布:在下一个版本更新中发布安全补丁
|
||||
- 公开披露:在修复完成后,我们会在更新日志中公布相关信息
|
||||
|
||||
4. **注意事项**
|
||||
- 在漏洞未修复前,请勿公开披露漏洞详情
|
||||
- 我们欢迎负责任的漏洞披露
|
||||
- 对于重大贡献者,我们会在项目致谢名单中提及
|
||||
|
||||
感谢您为 FastGPT 的安全性做出贡献!
|
||||
@@ -114,15 +114,15 @@ services:
|
||||
# fastgpt
|
||||
sandbox:
|
||||
container_name: sandbox
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||
networks:
|
||||
- fastgpt
|
||||
restart: always
|
||||
fastgpt:
|
||||
container_name: fastgpt
|
||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||
ports:
|
||||
- 3000:3000
|
||||
networks:
|
||||
|
||||
@@ -72,15 +72,15 @@ services:
|
||||
# fastgpt
|
||||
sandbox:
|
||||
container_name: sandbox
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||
networks:
|
||||
- fastgpt
|
||||
restart: always
|
||||
fastgpt:
|
||||
container_name: fastgpt
|
||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||
ports:
|
||||
- 3000:3000
|
||||
networks:
|
||||
|
||||
@@ -53,15 +53,15 @@ services:
|
||||
wait $$!
|
||||
sandbox:
|
||||
container_name: sandbox
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||
networks:
|
||||
- fastgpt
|
||||
restart: always
|
||||
fastgpt:
|
||||
container_name: fastgpt
|
||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
||||
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||
ports:
|
||||
- 3000:3000
|
||||
networks:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
title: 'V4.8.23(进行中)'
|
||||
title: 'V4.8.23'
|
||||
description: 'FastGPT V4.8.23 更新说明'
|
||||
icon: 'upgrade'
|
||||
draft: false
|
||||
@@ -7,6 +7,28 @@ toc: true
|
||||
weight: 802
|
||||
---
|
||||
|
||||
## 更新指南
|
||||
|
||||
### 1. 做好数据库备份
|
||||
|
||||
### 2. 更新镜像:
|
||||
|
||||
- 更新 fastgpt 镜像 tag: v4.8.23-fix
|
||||
- 更新 fastgpt-pro 商业版镜像 tag: v4.8.23-fix
|
||||
- Sandbox 镜像无需更新
|
||||
|
||||
### 3. 运行升级脚本
|
||||
|
||||
从任意终端,发起 1 个 HTTP 请求。其中 {{rootkey}} 替换成环境变量里的 `rootkey`;{{host}} 替换成**FastGPT 域名**。
|
||||
|
||||
```bash
|
||||
curl --location --request POST 'https://{{host}}/api/admin/initv4823' \
|
||||
--header 'rootkey: {{rootkey}}' \
|
||||
--header 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
脚本会清理一些知识库脏数据,主要是多余的全文索引。
|
||||
|
||||
## 🚀 新增内容
|
||||
|
||||
1. 增加默认“知识库文本理解模型”配置
|
||||
@@ -28,4 +50,5 @@ weight: 802
|
||||
2. 暂时移除 md 阅读优化,避免链接分割错误。
|
||||
3. 离开团队时,未刷新成员列表。
|
||||
4. PPTX 编码错误,导致解析失败。
|
||||
5. 删除知识库单条数据时,全文索引未跟随删除。
|
||||
5. 删除知识库单条数据时,全文索引未跟随删除。
|
||||
6. 修复 Mongo Dataset text 索引在查询数据时未生效。
|
||||
@@ -10,7 +10,6 @@ export type AuthTeamRoleProps = {
|
||||
export type CreateTeamProps = {
|
||||
name: string;
|
||||
avatar?: string;
|
||||
defaultTeam?: boolean;
|
||||
memberName?: string;
|
||||
memberAvatar?: string;
|
||||
notificationAccount?: string;
|
||||
|
||||
2
packages/global/support/user/team/type.d.ts
vendored
2
packages/global/support/user/team/type.d.ts
vendored
@@ -47,7 +47,6 @@ export type TeamMemberSchema = {
|
||||
role: `${TeamMemberRoleEnum}`;
|
||||
status: `${TeamMemberStatusEnum}`;
|
||||
avatar: string;
|
||||
defaultTeam: boolean;
|
||||
};
|
||||
|
||||
export type TeamMemberWithTeamAndUserSchema = TeamMemberSchema & {
|
||||
@@ -65,7 +64,6 @@ export type TeamTmbItemType = {
|
||||
balance?: number;
|
||||
tmbId: string;
|
||||
teamDomain: string;
|
||||
defaultTeam: boolean;
|
||||
role: `${TeamMemberRoleEnum}`;
|
||||
status: `${TeamMemberStatusEnum}`;
|
||||
notificationAccount?: string;
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
"nodeId": "lmpb9v2lo2lk",
|
||||
"name": "插件开始",
|
||||
"intro": "自定义配置外部输入,使用插件时,仅暴露自定义配置的输入",
|
||||
"avatar": "/imgs/workflow/input.png",
|
||||
"avatar": "core/workflow/template/workflowStart",
|
||||
"flowNodeType": "pluginInput",
|
||||
"showStatus": false,
|
||||
"position": {
|
||||
@@ -26,14 +26,16 @@
|
||||
"version": "481",
|
||||
"inputs": [
|
||||
{
|
||||
"renderTypeList": ["reference"],
|
||||
"renderTypeList": ["input", "reference"],
|
||||
"selectedTypeIndex": 0,
|
||||
"valueType": "string",
|
||||
"key": "url",
|
||||
"label": "url",
|
||||
"description": "需要读取的网页链接",
|
||||
"required": true,
|
||||
"toolDescription": "需要读取的网页链接"
|
||||
"toolDescription": "需要读取的网页链接",
|
||||
"list": [],
|
||||
"defaultValue": ""
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@@ -50,12 +52,12 @@
|
||||
"nodeId": "i7uow4wj2wdp",
|
||||
"name": "插件输出",
|
||||
"intro": "自定义配置外部输出,使用插件时,仅暴露自定义配置的输出",
|
||||
"avatar": "/imgs/workflow/output.png",
|
||||
"avatar": "core/workflow/template/pluginOutput",
|
||||
"flowNodeType": "pluginOutput",
|
||||
"showStatus": false,
|
||||
"position": {
|
||||
"x": 1607.7142331269129,
|
||||
"y": -150.8808596935447
|
||||
"x": 1853.935047606551,
|
||||
"y": -154.13661665265613
|
||||
},
|
||||
"version": "481",
|
||||
"inputs": [
|
||||
@@ -81,12 +83,12 @@
|
||||
"nodeId": "ebLCxU43hHuZ",
|
||||
"name": "HTTP 请求",
|
||||
"intro": "可以发出一个 HTTP 请求,实现更为复杂的操作(联网搜索、数据库查询等)",
|
||||
"avatar": "/imgs/workflow/http.png",
|
||||
"avatar": "core/workflow/template/httpRequest",
|
||||
"flowNodeType": "httpRequest468",
|
||||
"showStatus": true,
|
||||
"position": {
|
||||
"x": 1050.9890727421412,
|
||||
"y": -415.2085119990912
|
||||
"x": 1054.2940501177068,
|
||||
"y": -503.13661665265613
|
||||
},
|
||||
"version": "481",
|
||||
"inputs": [
|
||||
@@ -96,7 +98,7 @@
|
||||
"valueType": "dynamic",
|
||||
"label": "",
|
||||
"required": false,
|
||||
"description": "core.module.input.description.HTTP Dynamic Input",
|
||||
"description": "common:core.module.input.description.HTTP Dynamic Input",
|
||||
"customInputConfig": {
|
||||
"selectValueTypeList": [
|
||||
"string",
|
||||
@@ -107,16 +109,19 @@
|
||||
"arrayNumber",
|
||||
"arrayBoolean",
|
||||
"arrayObject",
|
||||
"arrayAny",
|
||||
"any",
|
||||
"chatHistory",
|
||||
"datasetQuote",
|
||||
"dynamic",
|
||||
"selectApp",
|
||||
"selectDataset"
|
||||
"selectDataset",
|
||||
"selectApp"
|
||||
],
|
||||
"showDescription": false,
|
||||
"showDefaultValue": true
|
||||
}
|
||||
},
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpMethod",
|
||||
@@ -124,17 +129,33 @@
|
||||
"valueType": "string",
|
||||
"label": "",
|
||||
"value": "POST",
|
||||
"required": true
|
||||
"required": true,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpTimeout",
|
||||
"renderTypeList": ["custom"],
|
||||
"valueType": "number",
|
||||
"label": "",
|
||||
"value": 30,
|
||||
"min": 5,
|
||||
"max": 600,
|
||||
"required": true,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpReqUrl",
|
||||
"renderTypeList": ["hidden"],
|
||||
"valueType": "string",
|
||||
"label": "",
|
||||
"description": "core.module.input.description.Http Request Url",
|
||||
"description": "common:core.module.input.description.Http Request Url",
|
||||
"placeholder": "https://api.ai.com/getInventory",
|
||||
"required": false,
|
||||
"value": "fetchUrl"
|
||||
"value": "fetchUrl",
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpHeader",
|
||||
@@ -142,9 +163,11 @@
|
||||
"valueType": "any",
|
||||
"value": [],
|
||||
"label": "",
|
||||
"description": "core.module.input.description.Http Request Header",
|
||||
"placeholder": "core.module.input.description.Http Request Header",
|
||||
"required": false
|
||||
"description": "common:core.module.input.description.Http Request Header",
|
||||
"placeholder": "common:core.module.input.description.Http Request Header",
|
||||
"required": false,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpParams",
|
||||
@@ -152,7 +175,9 @@
|
||||
"valueType": "any",
|
||||
"value": [],
|
||||
"label": "",
|
||||
"required": false
|
||||
"required": false,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpJsonBody",
|
||||
@@ -160,7 +185,29 @@
|
||||
"valueType": "any",
|
||||
"value": "{\n \"url\": \"{{url}}\"\n}",
|
||||
"label": "",
|
||||
"required": false
|
||||
"required": false,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpFormBody",
|
||||
"renderTypeList": ["hidden"],
|
||||
"valueType": "any",
|
||||
"value": [],
|
||||
"label": "",
|
||||
"required": false,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"key": "system_httpContentType",
|
||||
"renderTypeList": ["hidden"],
|
||||
"valueType": "string",
|
||||
"value": "json",
|
||||
"label": "",
|
||||
"required": false,
|
||||
"debugLabel": "",
|
||||
"toolDescription": ""
|
||||
},
|
||||
{
|
||||
"renderTypeList": ["reference"],
|
||||
@@ -178,12 +225,13 @@
|
||||
"arrayNumber",
|
||||
"arrayBoolean",
|
||||
"arrayObject",
|
||||
"arrayAny",
|
||||
"any",
|
||||
"chatHistory",
|
||||
"datasetQuote",
|
||||
"dynamic",
|
||||
"selectApp",
|
||||
"selectDataset"
|
||||
"selectDataset",
|
||||
"selectApp"
|
||||
],
|
||||
"showDescription": false,
|
||||
"showDefaultValue": true
|
||||
@@ -193,6 +241,23 @@
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "error",
|
||||
"key": "error",
|
||||
"label": "workflow:request_error",
|
||||
"description": "HTTP请求错误信息,成功时返回空",
|
||||
"valueType": "object",
|
||||
"type": "static"
|
||||
},
|
||||
{
|
||||
"id": "httpRawResponse",
|
||||
"key": "httpRawResponse",
|
||||
"required": true,
|
||||
"label": "workflow:raw_response",
|
||||
"description": "HTTP请求的原始响应。只能接受字符串或JSON类型响应数据。",
|
||||
"valueType": "any",
|
||||
"type": "static"
|
||||
},
|
||||
{
|
||||
"id": "system_addOutputParam",
|
||||
"key": "system_addOutputParam",
|
||||
@@ -220,23 +285,6 @@
|
||||
"showDefaultValue": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "error",
|
||||
"key": "error",
|
||||
"label": "请求错误",
|
||||
"description": "HTTP请求错误信息,成功时返回空",
|
||||
"valueType": "object",
|
||||
"type": "static"
|
||||
},
|
||||
{
|
||||
"id": "httpRawResponse",
|
||||
"key": "httpRawResponse",
|
||||
"label": "原始响应",
|
||||
"required": true,
|
||||
"description": "HTTP请求的原始响应。只能接受字符串或JSON类型响应数据。",
|
||||
"valueType": "any",
|
||||
"type": "static"
|
||||
},
|
||||
{
|
||||
"id": "rH4tMV02robs",
|
||||
"valueType": "string",
|
||||
@@ -260,6 +308,34 @@
|
||||
"sourceHandle": "ebLCxU43hHuZ-source-right",
|
||||
"targetHandle": "i7uow4wj2wdp-target-left"
|
||||
}
|
||||
]
|
||||
],
|
||||
"chatConfig": {
|
||||
"welcomeText": "",
|
||||
"variables": [],
|
||||
"questionGuide": {
|
||||
"open": false,
|
||||
"model": "gpt-4o-mini",
|
||||
"customPrompt": "You are an AI assistant tasked with predicting the user's next question based on the conversation history. Your goal is to generate 3 potential questions that will guide the user to continue the conversation. When generating these questions, adhere to the following rules:\n\n1. Use the same language as the user's last question in the conversation history.\n2. Keep each question under 20 characters in length.\n\nAnalyze the conversation history provided to you and use it as context to generate relevant and engaging follow-up questions. Your predictions should be logical extensions of the current topic or related areas that the user might be interested in exploring further.\n\nRemember to maintain consistency in tone and style with the existing conversation while providing diverse options for the user to choose from. Your goal is to keep the conversation flowing naturally and help the user delve deeper into the subject matter or explore related topics."
|
||||
},
|
||||
"ttsConfig": {
|
||||
"type": "web"
|
||||
},
|
||||
"whisperConfig": {
|
||||
"open": false,
|
||||
"autoSend": false,
|
||||
"autoTTSResponse": false
|
||||
},
|
||||
"chatInputGuide": {
|
||||
"open": false,
|
||||
"textList": [],
|
||||
"customUrl": ""
|
||||
},
|
||||
"instruction": "",
|
||||
"autoExecute": {
|
||||
"open": false,
|
||||
"defaultPrompt": ""
|
||||
},
|
||||
"_id": "677b59849d672185a5671b45"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,13 +3,16 @@ import { PassThrough } from 'stream';
|
||||
|
||||
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
|
||||
return new Promise<Buffer>((resolve, reject) => {
|
||||
let tmpBuffer: Buffer = Buffer.from([]);
|
||||
const chunks: Buffer[] = [];
|
||||
let totalLength = 0;
|
||||
|
||||
stream.on('data', (chunk) => {
|
||||
tmpBuffer = Buffer.concat([tmpBuffer, chunk]);
|
||||
chunks.push(chunk);
|
||||
totalLength += chunk.length;
|
||||
});
|
||||
stream.on('end', () => {
|
||||
resolve(tmpBuffer);
|
||||
const resultBuffer = Buffer.concat(chunks, totalLength); // 一次性拼接
|
||||
resolve(resultBuffer);
|
||||
});
|
||||
stream.on('error', (err) => {
|
||||
reject(err);
|
||||
|
||||
@@ -118,7 +118,7 @@ export async function delImgByRelatedId({
|
||||
}: {
|
||||
teamId: string;
|
||||
relateIds: string[];
|
||||
session: ClientSession;
|
||||
session?: ClientSession;
|
||||
}) {
|
||||
if (relateIds.length === 0) return;
|
||||
|
||||
|
||||
@@ -16,16 +16,30 @@ export async function connectMongo(): Promise<Mongoose> {
|
||||
|
||||
console.log('mongo start connect');
|
||||
try {
|
||||
// Remove existing listeners to prevent duplicates
|
||||
connectionMongo.connection.removeAllListeners('error');
|
||||
connectionMongo.connection.removeAllListeners('disconnected');
|
||||
connectionMongo.set('strictQuery', false);
|
||||
|
||||
connectionMongo.connection.on('error', async (error) => {
|
||||
console.log('mongo error', error);
|
||||
await connectionMongo.disconnect();
|
||||
await delay(1000);
|
||||
connectMongo();
|
||||
try {
|
||||
if (connectionMongo.connection.readyState !== 0) {
|
||||
await connectionMongo.disconnect();
|
||||
await delay(1000);
|
||||
await connectMongo();
|
||||
}
|
||||
} catch (error) {}
|
||||
});
|
||||
connectionMongo.connection.on('disconnected', () => {
|
||||
connectionMongo.connection.on('disconnected', async () => {
|
||||
console.log('mongo disconnected');
|
||||
try {
|
||||
if (connectionMongo.connection.readyState !== 0) {
|
||||
await connectionMongo.disconnect();
|
||||
await delay(1000);
|
||||
await connectMongo();
|
||||
}
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
await connectionMongo.connect(process.env.MONGODB_URI as string, {
|
||||
|
||||
@@ -2,6 +2,7 @@ import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/ap
|
||||
import * as cheerio from 'cheerio';
|
||||
import axios from 'axios';
|
||||
import { htmlToMarkdown } from './utils';
|
||||
import { isInternalAddress } from '../system/utils';
|
||||
|
||||
export const cheerioToHtml = ({
|
||||
fetchUrl,
|
||||
@@ -75,6 +76,16 @@ export const urlsFetch = async ({
|
||||
|
||||
const response = await Promise.all(
|
||||
urlList.map(async (url) => {
|
||||
const isInternal = isInternalAddress(url);
|
||||
if (isInternal) {
|
||||
return {
|
||||
url,
|
||||
title: '',
|
||||
content: 'Cannot fetch internal url',
|
||||
selector: ''
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const fetchRes = await axios.get(url, {
|
||||
timeout: 30000
|
||||
|
||||
63
packages/service/common/system/utils.ts
Normal file
63
packages/service/common/system/utils.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
import { SERVICE_LOCAL_HOST } from './tools';
|
||||
|
||||
/**
 * Hostnames of cloud instance-metadata services (AWS / Azure / Huawei share
 * 169.254.169.254, Alibaba uses 100.100.100.200, GCP and Tencent use names).
 * Matching is done on the hostname so that every path on these hosts is
 * blocked, not just one well-known prefix.
 */
const INTERNAL_METADATA_HOSTS: ReadonlySet<string> = new Set([
  '169.254.169.254',
  '100.100.100.200',
  'metadata.google.internal',
  'metadata.tencentyun.com'
]);

/** Dotted-quad IPv4 literal, as produced by the URL parser's host normalisation. */
const IPV4_LITERAL_PATTERN = /^(\d{1,3}\.){3}\d{1,3}$/;

/**
 * True when the IPv4 address whose first two octets are `a.b` lies in a
 * range that must not be reached from a server-side fetch.
 */
const isReservedIPv4 = (a: number, b: number): boolean =>
  a === 0 || // "this network"
  a === 10 || // private 10.0.0.0/8
  a === 127 || // loopback
  (a === 169 && b === 254) || // link-local, includes metadata services
  (a === 172 && b >= 16 && b <= 31) || // private 172.16.0.0/12
  (a === 192 && b === 168) || // private 192.168.0.0/16
  (a === 100 && b >= 64 && b <= 127) || // shared address space 100.64.0.0/10
  a >= 224 || // multicast (224-239) and reserved (240-255)
  // Kept from the previous implementation; presumably provider-internal
  // ranges — TODO confirm whether 9.0.x.x / 11.0.x.x still need blocking.
  (a === 9 && b === 0) ||
  (a === 11 && b === 0);

/**
 * Report whether `url` targets an internal (non-public) address that a
 * server-side fetch must not be allowed to reach.
 *
 * @param url - Absolute URL supplied by the caller.
 * @returns `true` when the URL must be treated as internal and blocked:
 *   it cannot be parsed, is not http(s), targets this service / localhost,
 *   a cloud metadata host, an IPv6 literal, or a reserved IPv4 range.
 *   `false` for ordinary domain names and public IPv4 addresses.
 *
 * NOTE: only the URL text is inspected. A public-looking domain that
 * resolves to a private address is not detected here.
 */
export const isInternalAddress = (url: string): boolean => {
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(url);
  } catch {
    // Fail closed: an unparseable URL is rejected as potentially unsafe.
    return true;
  }

  // Anything other than plain web URLs is not a legitimate fetch target.
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
    return true;
  }

  // Drop a trailing root dot so "localhost." cannot bypass the name checks.
  const hostname = parsedUrl.hostname.toLowerCase().replace(/\.$/, '');

  // The exact form of SERVICE_LOCAL_HOST is defined elsewhere; compare against
  // both the bare hostname and host:port — TODO confirm which one it uses.
  if (hostname === SERVICE_LOCAL_HOST || parsedUrl.host === SERVICE_LOCAL_HOST) {
    return true;
  }

  if (hostname === 'localhost' || hostname.endsWith('.localhost')) {
    return true;
  }

  if (INTERNAL_METADATA_HOSTS.has(hostname)) {
    return true;
  }

  // IPv6 literals are serialised in brackets. They are conservatively treated
  // as internal (the previous implementation also returned true for them).
  if (hostname.startsWith('[')) {
    return true;
  }

  // Not an IPv4 literal: a regular domain name, considered external.
  if (!IPV4_LITERAL_PATTERN.test(hostname)) {
    return false;
  }

  const octets = hostname.split('.').map(Number);
  if (octets.length !== 4 || octets.some((octet) => !(octet >= 0 && octet <= 255))) {
    // Malformed dotted quad: fail closed.
    return true;
  }

  const [first = NaN, second = NaN] = octets;
  return isReservedIPv4(first, second);
};
|
||||
@@ -35,7 +35,7 @@ export const getAxiosConfig = (props?: { userKey?: OpenaiAccountType }) => {
|
||||
const { userKey } = props || {};
|
||||
|
||||
const baseUrl = userKey?.baseUrl || global?.systemEnv?.oneapiUrl || openaiBaseUrl;
|
||||
const apiKey = userKey?.key || global?.systemEnv?.chatApiKey || process.env.CHAT_API_KEY || '';
|
||||
const apiKey = userKey?.key || global?.systemEnv?.chatApiKey || openaiBaseKey;
|
||||
|
||||
return {
|
||||
baseUrl,
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
{
|
||||
"provider": "AliCloud",
|
||||
"list": []
|
||||
}
|
||||
"list": [
|
||||
{
|
||||
"model": "SenseVoiceSmall",
|
||||
"name": "SenseVoiceSmall",
|
||||
"type": "stt"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
1
packages/service/core/app/plugin/type.d.ts
vendored
1
packages/service/core/app/plugin/type.d.ts
vendored
@@ -25,6 +25,7 @@ export type SystemPluginConfigSchemaType = {
|
||||
templateType: string;
|
||||
associatedPluginId: string;
|
||||
userGuide: string;
|
||||
author?: string;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -18,6 +18,9 @@ const AppTemplateSchema = new Schema({
|
||||
avatar: {
|
||||
type: String
|
||||
},
|
||||
author: {
|
||||
type: String
|
||||
},
|
||||
tags: {
|
||||
type: [String],
|
||||
default: undefined
|
||||
|
||||
@@ -25,6 +25,7 @@ import { MongoImage } from '../../../common/file/image/schema';
|
||||
import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
import { addDays } from 'date-fns';
|
||||
import { MongoDatasetDataText } from '../data/dataTextSchema';
|
||||
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
|
||||
|
||||
export const createCollectionAndInsertData = async ({
|
||||
dataset,
|
||||
@@ -216,7 +217,7 @@ export async function createOneCollection({
|
||||
nextSyncTime
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
return collection;
|
||||
@@ -234,7 +235,7 @@ export const delCollectionRelatedSource = async ({
|
||||
relatedImgId?: string;
|
||||
};
|
||||
}[];
|
||||
session: ClientSession;
|
||||
session?: ClientSession;
|
||||
}) => {
|
||||
if (collections.length === 0) return;
|
||||
|
||||
@@ -282,47 +283,55 @@ export async function delCollection({
|
||||
const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
|
||||
const collectionIds = collections.map((item) => String(item._id));
|
||||
|
||||
// Delete training data
|
||||
await MongoDatasetTraining.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
collectionId: { $in: collectionIds }
|
||||
await retryFn(async () => {
|
||||
await Promise.all([
|
||||
// Delete training data
|
||||
MongoDatasetTraining.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
collectionId: { $in: collectionIds }
|
||||
}),
|
||||
// Delete dataset_data_texts
|
||||
MongoDatasetDataText.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
collectionId: { $in: collectionIds }
|
||||
}),
|
||||
// Delete dataset_datas
|
||||
MongoDatasetData.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds },
|
||||
collectionId: { $in: collectionIds }
|
||||
}),
|
||||
...(delImg
|
||||
? [
|
||||
delImgByRelatedId({
|
||||
teamId,
|
||||
relateIds: collections
|
||||
.map((item) => item?.metadata?.relatedImgId || '')
|
||||
.filter(Boolean)
|
||||
})
|
||||
]
|
||||
: []),
|
||||
...(delFile
|
||||
? [
|
||||
delFileByFileIdList({
|
||||
bucketName: BucketNameEnum.dataset,
|
||||
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
|
||||
})
|
||||
]
|
||||
: []),
|
||||
// Delete vector data
|
||||
deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
|
||||
]);
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany(
|
||||
{
|
||||
teamId,
|
||||
_id: { $in: collectionIds }
|
||||
},
|
||||
{ session }
|
||||
);
|
||||
});
|
||||
|
||||
if (delImg) {
|
||||
await delImgByRelatedId({
|
||||
teamId,
|
||||
relateIds: collections.map((item) => item?.metadata?.relatedImgId || '').filter(Boolean),
|
||||
session
|
||||
});
|
||||
}
|
||||
if (delFile) {
|
||||
await delFileByFileIdList({
|
||||
bucketName: BucketNameEnum.dataset,
|
||||
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
|
||||
});
|
||||
}
|
||||
|
||||
// Delete dataset_datas
|
||||
await MongoDatasetData.deleteMany(
|
||||
{ teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
|
||||
{ session }
|
||||
);
|
||||
// Delete dataset_data_texts
|
||||
await MongoDatasetDataText.deleteMany(
|
||||
{ teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
|
||||
{ session }
|
||||
);
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany(
|
||||
{
|
||||
teamId,
|
||||
_id: { $in: collectionIds }
|
||||
},
|
||||
{ session }
|
||||
);
|
||||
|
||||
// no session delete: delete files, vector data
|
||||
await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ export const createOrGetCollectionTags = async ({
|
||||
datasetId,
|
||||
tag: tagContent
|
||||
})),
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
|
||||
|
||||
@@ -8,6 +8,7 @@ import { MongoDatasetData } from './data/schema';
|
||||
import { deleteDatasetDataVector } from '../../common/vectorStore/controller';
|
||||
import { MongoDatasetDataText } from './data/dataTextSchema';
|
||||
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
|
||||
import { retryFn } from '@fastgpt/global/common/system/utils';
|
||||
|
||||
/* ============= dataset ========== */
|
||||
/* find all datasetId by top datasetId */
|
||||
@@ -78,40 +79,39 @@ export async function delDatasetRelevantData({
|
||||
|
||||
const datasetIds = datasets.map((item) => item._id);
|
||||
|
||||
// delete training data
|
||||
await MongoDatasetTraining.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
});
|
||||
|
||||
// Get _id, teamId, fileId, metadata.relatedImgId for all collections
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
},
|
||||
'_id teamId datasetId fileId metadata',
|
||||
{ session }
|
||||
'_id teamId datasetId fileId metadata'
|
||||
).lean();
|
||||
|
||||
// Delete Image and file
|
||||
await delCollectionRelatedSource({ collections, session });
|
||||
await retryFn(async () => {
|
||||
await Promise.all([
|
||||
// delete training data
|
||||
MongoDatasetTraining.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
}),
|
||||
//Delete dataset_data_texts
|
||||
MongoDatasetDataText.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
}),
|
||||
//delete dataset_datas
|
||||
MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } }),
|
||||
// Delete Image and file
|
||||
delCollectionRelatedSource({ collections }),
|
||||
// Delete vector data
|
||||
deleteDatasetDataVector({ teamId, datasetIds })
|
||||
]);
|
||||
});
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
}).session(session);
|
||||
|
||||
// No session delete:
|
||||
// Delete dataset_data_texts
|
||||
await MongoDatasetDataText.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
});
|
||||
// delete dataset_datas
|
||||
await MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } });
|
||||
|
||||
// Delete vector data
|
||||
await deleteDatasetDataVector({ teamId, datasetIds });
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ try {
|
||||
default_language: 'none'
|
||||
}
|
||||
);
|
||||
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, collectionId: 1 });
|
||||
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
||||
@@ -62,6 +62,7 @@ export const dispatchLoop = async (props: Props): Promise<Response> => {
|
||||
|
||||
const response = await dispatchWorkFlow({
|
||||
...props,
|
||||
variables: newVariables,
|
||||
runtimeEdges: cloneDeep(runtimeEdges)
|
||||
});
|
||||
|
||||
|
||||
@@ -120,27 +120,144 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
|
||||
2. Replace newline strings
|
||||
*/
|
||||
const replaceJsonBodyString = (text: string) => {
|
||||
const valToStr = (val: any) => {
|
||||
// Check if the variable is in quotes
|
||||
const isVariableInQuotes = (text: string, variable: string) => {
|
||||
const index = text.indexOf(variable);
|
||||
if (index === -1) return false;
|
||||
|
||||
// 计算变量前面的引号数量
|
||||
const textBeforeVar = text.substring(0, index);
|
||||
const matches = textBeforeVar.match(/"/g) || [];
|
||||
|
||||
// 如果引号数量为奇数,则变量在引号内
|
||||
return matches.length % 2 === 1;
|
||||
};
|
||||
const valToStr = (val: any, isQuoted = false) => {
|
||||
if (val === undefined) return 'null';
|
||||
if (val === null) return 'null';
|
||||
|
||||
if (typeof val === 'object') return JSON.stringify(val);
|
||||
|
||||
if (typeof val === 'string') {
|
||||
if (isQuoted) {
|
||||
return val.replace(/(?<!\\)"/g, '\\"');
|
||||
}
|
||||
try {
|
||||
const parsed = JSON.parse(val);
|
||||
if (typeof parsed === 'object') {
|
||||
return JSON.stringify(parsed);
|
||||
}
|
||||
JSON.parse(val);
|
||||
return val;
|
||||
} catch (error) {
|
||||
const str = JSON.stringify(val);
|
||||
|
||||
return str.startsWith('"') && str.endsWith('"') ? str.slice(1, -1) : str;
|
||||
}
|
||||
}
|
||||
|
||||
return String(val);
|
||||
};
|
||||
// Test cases for variable replacement in JSON body
|
||||
// const bodyTest = () => {
|
||||
// const testData = [
|
||||
// // 基本字符串替换
|
||||
// {
|
||||
// body: `{"name":"{{name}}","age":"18"}`,
|
||||
// variables: [{ key: '{{name}}', value: '测试' }],
|
||||
// result: `{"name":"测试","age":"18"}`
|
||||
// },
|
||||
// // 特殊字符处理
|
||||
// {
|
||||
// body: `{"text":"{{text}}"}`,
|
||||
// variables: [{ key: '{{text}}', value: '包含"引号"和\\反斜杠' }],
|
||||
// result: `{"text":"包含\\"引号\\"和\\反斜杠"}`
|
||||
// },
|
||||
// // 数字类型处理
|
||||
// {
|
||||
// body: `{"count":{{count}},"price":{{price}}}`,
|
||||
// variables: [
|
||||
// { key: '{{count}}', value: '42' },
|
||||
// { key: '{{price}}', value: '99.99' }
|
||||
// ],
|
||||
// result: `{"count":42,"price":99.99}`
|
||||
// },
|
||||
// // 布尔值处理
|
||||
// {
|
||||
// body: `{"isActive":{{isActive}},"hasData":{{hasData}}}`,
|
||||
// variables: [
|
||||
// { key: '{{isActive}}', value: 'true' },
|
||||
// { key: '{{hasData}}', value: 'false' }
|
||||
// ],
|
||||
// result: `{"isActive":true,"hasData":false}`
|
||||
// },
|
||||
// // 对象类型处理
|
||||
// {
|
||||
// body: `{"user":{{user}},"user2":"{{user2}}"}`,
|
||||
// variables: [
|
||||
// { key: '{{user}}', value: `{"id":1,"name":"张三"}` },
|
||||
// { key: '{{user2}}', value: `{"id":1,"name":"张三"}` }
|
||||
// ],
|
||||
// result: `{"user":{"id":1,"name":"张三"},"user2":"{\\"id\\":1,\\"name\\":\\"张三\\"}"}`
|
||||
// },
|
||||
// // 数组类型处理
|
||||
// {
|
||||
// body: `{"items":{{items}}}`,
|
||||
// variables: [{ key: '{{items}}', value: '[1, 2, 3]' }],
|
||||
// result: `{"items":[1,2,3]}`
|
||||
// },
|
||||
// // null 和 undefined 处理
|
||||
// {
|
||||
// body: `{"nullValue":{{nullValue}},"undefinedValue":{{undefinedValue}}}`,
|
||||
// variables: [
|
||||
// { key: '{{nullValue}}', value: 'null' },
|
||||
// { key: '{{undefinedValue}}', value: 'undefined' }
|
||||
// ],
|
||||
// result: `{"nullValue":null,"undefinedValue":null}`
|
||||
// },
|
||||
// // 嵌套JSON结构
|
||||
// {
|
||||
// body: `{"data":{"nested":{"value":"{{nestedValue}}"}}}`,
|
||||
// variables: [{ key: '{{nestedValue}}', value: '嵌套值' }],
|
||||
// result: `{"data":{"nested":{"value":"嵌套值"}}}`
|
||||
// },
|
||||
// // 多变量替换
|
||||
// {
|
||||
// body: `{"first":"{{first}}","second":"{{second}}","third":{{third}}}`,
|
||||
// variables: [
|
||||
// { key: '{{first}}', value: '第一' },
|
||||
// { key: '{{second}}', value: '第二' },
|
||||
// { key: '{{third}}', value: '3' }
|
||||
// ],
|
||||
// result: `{"first":"第一","second":"第二","third":3}`
|
||||
// },
|
||||
// // JSON字符串作为变量值
|
||||
// {
|
||||
// body: `{"config":{{config}}}`,
|
||||
// variables: [{ key: '{{config}}', value: '{"setting":"enabled","mode":"advanced"}' }],
|
||||
// result: `{"config":{"setting":"enabled","mode":"advanced"}}`
|
||||
// }
|
||||
// ];
|
||||
|
||||
// for (let i = 0; i < testData.length; i++) {
|
||||
// const item = testData[i];
|
||||
// let bodyStr = item.body;
|
||||
// for (const variable of item.variables) {
|
||||
// const isQuote = isVariableInQuotes(bodyStr, variable.key);
|
||||
// bodyStr = bodyStr.replace(variable.key, valToStr(variable.value, isQuote));
|
||||
// }
|
||||
// bodyStr = bodyStr.replace(/(".*?")\s*:\s*undefined\b/g, '$1:null');
|
||||
|
||||
// console.log(bodyStr === item.result, i);
|
||||
// if (bodyStr !== item.result) {
|
||||
// console.log(bodyStr);
|
||||
// console.log(item.result);
|
||||
// } else {
|
||||
// try {
|
||||
// JSON.parse(item.result);
|
||||
// } catch (error) {
|
||||
// console.log('反序列化异常', i, item.result);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// };
|
||||
// bodyTest();
|
||||
|
||||
// 1. Replace {{key.key}} variables
|
||||
const regex1 = /\{\{\$([^.]+)\.([^$]+)\$\}\}/g;
|
||||
@@ -148,6 +265,10 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
|
||||
matches1.forEach((match) => {
|
||||
const nodeId = match[1];
|
||||
const id = match[2];
|
||||
const fullMatch = match[0];
|
||||
|
||||
// 检查变量是否在引号内
|
||||
const isInQuotes = isVariableInQuotes(text, fullMatch);
|
||||
|
||||
const variableVal = (() => {
|
||||
if (nodeId === VARIABLE_NODE_ID) {
|
||||
@@ -165,9 +286,9 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
|
||||
return getReferenceVariableValue({ value: input.value, nodes: runtimeNodes, variables });
|
||||
})();
|
||||
|
||||
const formatVal = valToStr(variableVal);
|
||||
const formatVal = valToStr(variableVal, isInQuotes);
|
||||
|
||||
const regex = new RegExp(`\\{\\{\\$(${nodeId}\\.${id})\\$\\}\\}`, 'g');
|
||||
const regex = new RegExp(`\\{\\{\\$(${nodeId}\\.${id})\\$\\}\\}`, '');
|
||||
text = text.replace(regex, () => formatVal);
|
||||
});
|
||||
|
||||
@@ -176,10 +297,16 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
|
||||
const matches2 = text.match(regex2) || [];
|
||||
const uniqueKeys2 = [...new Set(matches2.map((match) => match.slice(2, -2)))];
|
||||
for (const key of uniqueKeys2) {
|
||||
text = text.replace(new RegExp(`{{(${key})}}`, 'g'), () => valToStr(allVariables[key]));
|
||||
const fullMatch = `{{${key}}}`;
|
||||
// 检查变量是否在引号内
|
||||
const isInQuotes = isVariableInQuotes(text, fullMatch);
|
||||
|
||||
text = text.replace(new RegExp(`{{(${key})}}`, ''), () =>
|
||||
valToStr(allVariables[key], isInQuotes)
|
||||
);
|
||||
}
|
||||
|
||||
return text.replace(/(".*?")\s*:\s*undefined\b/g, '$1: null');
|
||||
return text.replace(/(".*?")\s*:\s*undefined\b/g, '$1:null');
|
||||
};
|
||||
|
||||
httpReqUrl = replaceStringVariables(httpReqUrl);
|
||||
|
||||
@@ -196,7 +196,8 @@ export async function syncCollaborators({
|
||||
permission: item.permission
|
||||
})),
|
||||
{
|
||||
session
|
||||
session,
|
||||
ordered: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -43,7 +43,6 @@ async function getTeamMember(match: Record<string, any>): Promise<TeamTmbItemTyp
|
||||
teamDomain: tmb.team?.teamDomain,
|
||||
role: tmb.role,
|
||||
status: tmb.status,
|
||||
defaultTeam: tmb.defaultTeam,
|
||||
permission: new TeamPermission({
|
||||
per: Per ?? TeamDefaultPermissionVal,
|
||||
isOwner: tmb.role === TeamMemberRoleEnum.owner
|
||||
@@ -71,8 +70,7 @@ export async function getUserDefaultTeam({ userId }: { userId: string }) {
|
||||
return Promise.reject('tmbId or userId is required');
|
||||
}
|
||||
return getTeamMember({
|
||||
userId: new Types.ObjectId(userId),
|
||||
defaultTeam: true
|
||||
userId: new Types.ObjectId(userId)
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -39,14 +39,14 @@ const TeamMemberSchema = new Schema({
|
||||
updateTime: {
|
||||
type: Date
|
||||
},
|
||||
defaultTeam: {
|
||||
type: Boolean,
|
||||
default: false
|
||||
},
|
||||
|
||||
// Abandoned
|
||||
role: {
|
||||
type: String
|
||||
},
|
||||
// Abandoned
|
||||
defaultTeam: {
|
||||
type: Boolean
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ export const initTeamFreePlan = async ({
|
||||
surplusPoints: freePoints
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
};
|
||||
|
||||
|
||||
@@ -160,7 +160,7 @@ export const createTrainingUsage = async ({
|
||||
]
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
return { billId: String(_id) };
|
||||
|
||||
@@ -214,10 +214,15 @@ export function useScrollPagination<
|
||||
async (init = false, ScrollContainerRef?: RefObject<HTMLDivElement>) => {
|
||||
if (noMore && !init) return;
|
||||
|
||||
if (init) {
|
||||
setData([]);
|
||||
setTotal(0);
|
||||
}
|
||||
|
||||
const offset = init ? 0 : data.length;
|
||||
|
||||
setTrue();
|
||||
console.log(offset);
|
||||
|
||||
try {
|
||||
const res = await api({
|
||||
offset,
|
||||
@@ -288,7 +293,7 @@ export function useScrollPagination<
|
||||
// Watch scroll position
|
||||
useThrottleEffect(
|
||||
() => {
|
||||
if (!ref?.current || noMore) return;
|
||||
if (!ref?.current || noMore || isLoading || data.length === 0) return;
|
||||
const { scrollTop, scrollHeight, clientHeight } = ref.current;
|
||||
|
||||
if (
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
"key_type": "API key format:",
|
||||
"log": "Call log",
|
||||
"log_detail": "Log details",
|
||||
"log_request_id_search": "Search by requestId",
|
||||
"log_status": "Status",
|
||||
"mapping": "Model Mapping",
|
||||
"mapping_tip": "A valid Json is required. \nThe model can be mapped when sending a request to the actual address. \nFor example:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\nWhen FastGPT requests the gpt-4o model, the gpt-4o-test model is sent to the actual address, instead of gpt-4o.",
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
"key_type": "API key 格式: ",
|
||||
"log": "调用日志",
|
||||
"log_detail": "日志详情",
|
||||
"log_request_id_search": "根据 requestId 搜索",
|
||||
"log_status": "状态",
|
||||
"mapping": "模型映射",
|
||||
"mapping_tip": "需填写一个有效 Json。可在向实际地址发送请求时,对模型进行映射。例如:\n{\n \"gpt-4o\": \"gpt-4o-test\"\n}\n当 FastGPT 请求 gpt-4o 模型时,会向实际地址发送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
"key_type": "API key 格式:",
|
||||
"log": "調用日誌",
|
||||
"log_detail": "日誌詳情",
|
||||
"log_request_id_search": "根據 requestId 搜索",
|
||||
"log_status": "狀態",
|
||||
"mapping": "模型映射",
|
||||
"mapping_tip": "需填寫一個有效 Json。\n可在向實際地址發送請求時,對模型進行映射。\n例如:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\n當 FastGPT 請求 gpt-4o 模型時,會向實際地址發送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
||||
|
||||
3
plugins/webcrawler/.dockerignore
Normal file
3
plugins/webcrawler/.dockerignore
Normal file
@@ -0,0 +1,3 @@
|
||||
# 忽略 .git 目录及其内容
|
||||
.git
|
||||
.gitignore
|
||||
25
plugins/webcrawler/.gitignore
vendored
Normal file
25
plugins/webcrawler/.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
*~
|
||||
|
||||
searxng-docker.service
|
||||
caddy
|
||||
srv
|
||||
searxng/uwsgi.ini
|
||||
.env
|
||||
SPIDER/.env
|
||||
|
||||
# 忽略 node_modules 文件夹
|
||||
SPIDER/node_modules/
|
||||
|
||||
# 忽略构建输出文件夹
|
||||
SPIDER/dist/
|
||||
|
||||
# 忽略日志文件
|
||||
*.log
|
||||
|
||||
# 忽略操作系统生成的文件
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# 忽略 IDE/编辑器生成的文件
|
||||
.vscode/
|
||||
.idea/
|
||||
14
plugins/webcrawler/.searchxng.env
Normal file
14
plugins/webcrawler/.searchxng.env
Normal file
@@ -0,0 +1,14 @@
|
||||
# By default listen on https://localhost
|
||||
# To change this:
|
||||
# * uncomment SEARXNG_HOSTNAME, and replace <host> by the SearXNG hostname
|
||||
# * uncomment LETSENCRYPT_EMAIL, and replace <email> by your email (required to create a Let's Encrypt certificate)
|
||||
|
||||
# SEARXNG_HOSTNAME=<host>
|
||||
# LETSENCRYPT_EMAIL=<email>
|
||||
|
||||
# Optional:
|
||||
# If you run a very small or a very large instance, you might want to change the amount of used uwsgi workers and threads per worker
|
||||
# More workers (= processes) means that more search requests can be handled at the same time, but it also causes more resource usage
|
||||
|
||||
SEARXNG_UWSGI_WORKERS=4
|
||||
SEARXNG_UWSGI_THREADS=4
|
||||
91
plugins/webcrawler/Caddyfile
Normal file
91
plugins/webcrawler/Caddyfile
Normal file
@@ -0,0 +1,91 @@
|
||||
{
|
||||
admin off
|
||||
|
||||
log {
|
||||
output stderr
|
||||
format filter {
|
||||
# Preserves first 8 bits from IPv4 and 32 bits from IPv6
|
||||
request>remote_ip ip_mask 8 32
|
||||
request>client_ip ip_mask 8 32
|
||||
|
||||
# Remove identifiable information
|
||||
request>remote_port delete
|
||||
request>headers delete
|
||||
request>uri query {
|
||||
delete url
|
||||
delete h
|
||||
delete q
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{$SEARXNG_HOSTNAME}
|
||||
|
||||
tls {$SEARXNG_TLS}
|
||||
|
||||
encode zstd gzip
|
||||
|
||||
@api {
|
||||
path /config
|
||||
path /healthz
|
||||
path /stats/errors
|
||||
path /stats/checker
|
||||
}
|
||||
|
||||
@search {
|
||||
path /search
|
||||
}
|
||||
|
||||
@imageproxy {
|
||||
path /image_proxy
|
||||
}
|
||||
|
||||
@static {
|
||||
path /static/*
|
||||
}
|
||||
|
||||
header {
|
||||
# CSP (https://content-security-policy.com)
|
||||
Content-Security-Policy "upgrade-insecure-requests; default-src 'none'; script-src 'self'; style-src 'self' 'unsafe-inline'; form-action 'self' https://github.com/searxng/searxng/issues/new; font-src 'self'; frame-ancestors 'self'; base-uri 'self'; connect-src 'self' https://overpass-api.de; img-src * data:; frame-src https://www.youtube-nocookie.com https://player.vimeo.com https://www.dailymotion.com https://www.deezer.com https://www.mixcloud.com https://w.soundcloud.com https://embed.spotify.com;"
|
||||
|
||||
# Disable some browser features
|
||||
Permissions-Policy "accelerometer=(),camera=(),geolocation=(),gyroscope=(),magnetometer=(),microphone=(),payment=(),usb=()"
|
||||
|
||||
# Set referrer policy
|
||||
Referrer-Policy "no-referrer"
|
||||
|
||||
# Force clients to use HTTPS
|
||||
Strict-Transport-Security "max-age=31536000"
|
||||
|
||||
# Prevent MIME type sniffing from the declared Content-Type
|
||||
X-Content-Type-Options "nosniff"
|
||||
|
||||
# X-Robots-Tag (comment to allow site indexing)
|
||||
X-Robots-Tag "noindex, noarchive, nofollow"
|
||||
|
||||
# Remove "Server" header
|
||||
-Server
|
||||
}
|
||||
|
||||
header @api {
|
||||
Access-Control-Allow-Methods "GET, OPTIONS"
|
||||
Access-Control-Allow-Origin "*"
|
||||
}
|
||||
|
||||
route {
|
||||
# Cache policy
|
||||
header Cache-Control "max-age=0, no-store"
|
||||
header @search Cache-Control "max-age=5, private"
|
||||
header @imageproxy Cache-Control "max-age=604800, public"
|
||||
header @static Cache-Control "max-age=31536000, public, immutable"
|
||||
}
|
||||
|
||||
# SearXNG (uWSGI)
|
||||
reverse_proxy localhost:8080 {
|
||||
header_up X-Forwarded-Port ""
|
||||
header_up X-Real-IP ""
|
||||
|
||||
# https://github.com/searx/searx-docker/issues/24
|
||||
header_up Connection "close"
|
||||
}
|
||||
57
plugins/webcrawler/Dockerfile
Normal file
57
plugins/webcrawler/Dockerfile
Normal file
@@ -0,0 +1,57 @@
|
||||
FROM node:20.10.0-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 安装 Chrome 运行依赖
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ca-certificates \
|
||||
fonts-liberation \
|
||||
libasound2 \
|
||||
libatk-bridge2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libc6 \
|
||||
libcairo2 \
|
||||
libcups2 \
|
||||
libdbus-1-3 \
|
||||
libexpat1 \
|
||||
libfontconfig1 \
|
||||
libgbm1 \
|
||||
libgcc1 \
|
||||
libglib2.0-0 \
|
||||
libgtk-3-0 \
|
||||
libnspr4 \
|
||||
libnss3 \
|
||||
libpango-1.0-0 \
|
||||
libpangocairo-1.0-0 \
|
||||
libstdc++6 \
|
||||
libx11-6 \
|
||||
libx11-xcb1 \
|
||||
libxcb1 \
|
||||
libxcomposite1 \
|
||||
libxcursor1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxi6 \
|
||||
libxrandr2 \
|
||||
libxrender1 \
|
||||
libxss1 \
|
||||
libxtst6 \
|
||||
lsb-release \
|
||||
wget \
|
||||
xdg-utils \
|
||||
chromium \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 安装中文字体
|
||||
RUN apt-get update && apt-get install -y fonts-wqy-microhei && fc-cache -f -v
|
||||
|
||||
COPY SPIDER/. .
|
||||
|
||||
RUN test -f package.json || (echo "package.json missing" && exit 1)
|
||||
RUN test -f .env || (echo ".env file missing in SPIDER directory" && exit 1)
|
||||
|
||||
RUN npm run build
|
||||
|
||||
EXPOSE 3000
|
||||
CMD ["npm", "start"]
|
||||
73
plugins/webcrawler/README.md
Normal file
73
plugins/webcrawler/README.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# webcrawler
|
||||
## docker版快速部署
|
||||
|
||||
## 代码版部署
|
||||
0. 按照 https://github.com/searxng/searxng-docker 的方式处理docker
|
||||
1. 参考SPIDER文件夹下的.env.example,添加.env文件
|
||||
2. 进入SPIDER文件夹进行pnpm install
|
||||
3. 回到根目录,运行docker compose up -d
|
||||
|
||||
## 代码版开发
|
||||
1. 将docker-compose.yml中与SPIDER相关的部分注释掉(nodeapp)
|
||||
2. .env文件中的URL参照注释修改
|
||||
3. 注释掉启动puppeteer部分里面指定浏览器地址的代码
|
||||
4. pnpm run dev
|
||||
|
||||
|
||||
## 测试样例:
|
||||
Auth的Bearer Token记得填,也就是.env里的ACCESS_TOKEN
|
||||
|
||||
### 读取单页面(content以HTML形式返回)
|
||||
```
|
||||
http://localhost:3000/api/read?queryUrl=<url>
|
||||
```
|
||||
|
||||
返回结构
|
||||
```json
|
||||
|
||||
{
|
||||
"status": 200,
|
||||
"data": {
|
||||
"title": "something here",
|
||||
"content": "something here"
|
||||
}
|
||||
}
|
||||
{
|
||||
"status": 400,
|
||||
"error": {
|
||||
"code": "MISSING_PARAM",
|
||||
"message": "缺少必要参数: queryUrl"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 搜索(content以HTML形式返回)
|
||||
```
|
||||
http://localhost:3000/api/search?query=<something>&pageCount=5&needDetails=true&engine=baidu
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"status": 200,
|
||||
"data": {
|
||||
"results": [
|
||||
{
|
||||
"title": "string",
|
||||
"url": "string",
|
||||
"snippet": "string",
|
||||
"source": "string",
|
||||
"crawlStatus": "string",
|
||||
"score": 0,
|
||||
"content": "string"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"status": 400,
|
||||
"error": {
|
||||
"code": "MISSING_PARAM",
|
||||
"message": "缺少必要参数: query"
|
||||
}
|
||||
}
|
||||
```
|
||||
23
plugins/webcrawler/SPIDER/.env.example
Normal file
23
plugins/webcrawler/SPIDER/.env.example
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
ACCESS_TOKEN=114514
|
||||
DETECT_WEBSITE = zhuanlan.zhihu.com
|
||||
STRATEGIES=[{"waitUntil":"networkidle0","timeout":5000},{"waitUntil":"networkidle2","timeout":10000},{"waitUntil":"load","timeout":15000}]
|
||||
PORT=3000
|
||||
MAX_CONCURRENCY=10
|
||||
NODE_ENV=development
|
||||
ENGINE = [
|
||||
|
||||
]
|
||||
|
||||
ENGINE_BAIDUURL=https://www.baidu.com/s
|
||||
#ENGINE_SEARCHXNGURL=http://localhost:8080/search
|
||||
ENGINE_SEARCHXNGURL=http://searxng:8080/search
|
||||
|
||||
#MONGODB_URI=mongodb://root:example@localhost:27017
|
||||
MONGODB_URI=mongodb://root:example@mongodb:27017
|
||||
BLACKLIST = [".gov.cn",".edu.cn"]
|
||||
|
||||
STD_TTL=3600
|
||||
EXPIRE_AFTER_SECONDS=9000
|
||||
|
||||
#VALIDATE_PROXY=[{"ip":"","port":},{"ip":"","port":}]
|
||||
5804
plugins/webcrawler/SPIDER/package-lock.json
generated
Normal file
5804
plugins/webcrawler/SPIDER/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
62
plugins/webcrawler/SPIDER/package.json
Normal file
62
plugins/webcrawler/SPIDER/package.json
Normal file
@@ -0,0 +1,62 @@
|
||||
{
|
||||
"name": "spider",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "/dist/index.ts",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"start": "ts-node src/index.ts",
|
||||
"build": "webpack",
|
||||
"dev": "ts-node-dev --respawn src/index.ts"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@types/node-fetch": "^2.6.12",
|
||||
"assert": "^2.1.0",
|
||||
"axios": "^1.7.9",
|
||||
"body-parser": "^1.20.3",
|
||||
"browserify-zlib": "^0.2.0",
|
||||
"buffer": "^6.0.3",
|
||||
"cheerio": "^1.0.0",
|
||||
"crypto-browserify": "^3.12.1",
|
||||
"dotenv": "^16.4.7",
|
||||
"express": "^4.21.2",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"jsdom": "^26.0.0",
|
||||
"mongodb": "^6.13.1",
|
||||
"node-cache": "^5.1.2",
|
||||
"node-fetch": "^2.7.0",
|
||||
"os-browserify": "^0.3.0",
|
||||
"path-browserify": "^1.0.1",
|
||||
"puppeteer": "^24.2.1",
|
||||
"puppeteer-cluster": "^0.24.0",
|
||||
"querystring-es3": "^0.2.1",
|
||||
"random-useragent": "^0.5.0",
|
||||
"spider": "file:",
|
||||
"stream-browserify": "^3.0.0",
|
||||
"stream-http": "^3.2.0",
|
||||
"string_decoder": "^1.3.0",
|
||||
"turndown": "^7.2.0",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"url": "^0.11.4",
|
||||
"user-agents": "^1.1.454",
|
||||
"util": "^0.12.5",
|
||||
"vm-browserify": "^1.1.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/body-parser": "^1.19.5",
|
||||
"@types/express": "^5.0.0",
|
||||
"@types/jsdom": "^21.1.7",
|
||||
"@types/node": "^22.13.4",
|
||||
"@types/random-useragent": "^0.3.3",
|
||||
"@types/user-agents": "^1.0.4",
|
||||
"ts-loader": "^9.5.2",
|
||||
"ts-node-dev": "^2.0.0",
|
||||
"typescript": "^5.7.3",
|
||||
"webpack": "^5.98.0",
|
||||
"webpack-cli": "^6.0.1",
|
||||
"webpack-node-externals": "^3.0.0"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
import { Request, Response } from 'express';
|
||||
import fetch from 'node-fetch';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
/** Desktop browser User-Agent strings; one is picked at random for each outgoing request. */
const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
];

/** Returns true when `value` parses as an absolute URL whose scheme is http or https. */
const isHttpUrl = (value: string): boolean => {
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
};

/**
 * Express handler that downloads the page at `req.query.url` with a plain HTTP
 * request (no browser) and returns the raw response body.
 *
 * Responses:
 * - 200 `{ status, data: { content } }` with the fetched body text.
 * - 400 `MISSING_PARAM` when `url` is absent.
 * - 400 `INVALID_PARAM` when `url` is not a single http(s) URL string.
 * - 500 `INTERNAL_SERVER_ERROR` when the upstream request fails or is not 2xx.
 *
 * NOTE(review): this endpoint requests caller-supplied URLs from the server side;
 * confirm it is only reachable by trusted callers or add host restrictions.
 */
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;

  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: url'
      }
    });
    return;
  }

  // Express turns repeated or bracketed query keys into arrays/objects, so the
  // value must be checked rather than cast before it is handed to fetch.
  if (typeof url !== 'string' || !isHttpUrl(url)) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'INVALID_PARAM',
        message: '无效的参数: url'
      }
    });
    return;
  }

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        Referer: 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        Connection: 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    // If the failure happened while writing the success response, a second
    // response cannot be sent.
    if (res.headersSent) return;
    res.status(500).json({
      status: 500,
      error: {
        code: 'INTERNAL_SERVER_ERROR',
        message: '发生错误'
      }
    });
  }
};
|
||||
|
||||
export default { quickFetch };
|
||||
148
plugins/webcrawler/SPIDER/src/controllers/readController.ts
Normal file
148
plugins/webcrawler/SPIDER/src/controllers/readController.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import { Request, Response } from 'express';
|
||||
import puppeteer, { Page } from 'puppeteer';
|
||||
import * as cheerio from 'cheerio';
|
||||
import UserAgent from 'user-agents';
|
||||
import { setupPage } from '../utils/setupPage'; // 导入 setupPage 模块
|
||||
import dotenv from 'dotenv'; // 导入 dotenv 模块
|
||||
import { URL } from 'url'; // 导入 URL 模块
|
||||
import { handleSpecialWebsite } from '../specialHandlers'; // 导入 handleSpecialWebsite 模块
|
||||
import fetch from 'node-fetch';
|
||||
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // 导入缓存相关模块
|
||||
|
||||
dotenv.config(); // 加载环境变量
|
||||
|
||||
// Comma-separated site substrings whose pages get the setupPage() treatment.
// DETECT_WEBSITES is preferred; DETECT_WEBSITE is accepted as a fallback because
// .env.example declares the singular name. Blank entries are dropped so an empty
// value does not match every URL.
const detectWebsites = (process.env.DETECT_WEBSITES ?? process.env.DETECT_WEBSITE ?? '')
  .split(',')
  .map((site) => site.trim())
  .filter(Boolean);

// Hostname suffixes that must never be fetched (JSON array in BLACKLIST).
const blacklistDomains: string[] = process.env.BLACKLIST ? JSON.parse(process.env.BLACKLIST) : [];

/** Parses `value` as an absolute http(s) URL; returns undefined for anything else. */
const parseHttpUrl = (value: string): URL | undefined => {
  try {
    const parsed = new URL(value);
    return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : undefined;
  } catch {
    return undefined;
  }
};

/**
 * Loads `safeUrl` in a fresh headless Chromium instance and extracts the page
 * title plus its content (site-specific handler first, `<body>` HTML otherwise).
 * The browser is always closed, including when navigation or extraction throws.
 */
const renderWithBrowser = async (queryUrl: string, safeUrl: string) => {
  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ['--enable-automation'],
    headless: true,
    executablePath: '/usr/bin/chromium', // explicit Chromium path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu'
      // '--single-process'
    ]
  });

  try {
    const page = await browser.newPage();

    // Listed sites get the full page setup; everything else only gets a desktop UA.
    if (detectWebsites.some((website) => queryUrl.includes(website))) {
      await setupPage(page);
    } else {
      const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
      await page.setUserAgent(userAgent.toString());
    }

    await page.goto(safeUrl, { waitUntil: 'load' });
    await page.waitForSelector('body');

    const title = await page.title();
    let cleanedContent = await handleSpecialWebsite(page, queryUrl);

    if (!cleanedContent) {
      const content = await page.content();
      const $ = cheerio.load(content);
      cleanedContent = $('body').html();
    }

    return { title, content: cleanedContent };
  } finally {
    // Closing the browser also disposes of its pages.
    await browser.close();
  }
};

/**
 * Express handler that reads the page at `req.query.queryUrl` and returns its
 * title and `<body>` HTML.
 *
 * It first tries a plain HTTP fetch; if that fails or is not 2xx it falls back
 * to rendering the page in headless Chromium. After a successful response the
 * cache is refreshed on a best-effort basis.
 *
 * Responses:
 * - 200 `{ status, data: { title, content } }`.
 * - 400 `MISSING_PARAM` when `queryUrl` is absent.
 * - 400 `INVALID_PARAM` when `queryUrl` is not a single http(s) URL string.
 * - 403 `BLACKLISTED_DOMAIN` when the hostname ends with a blacklisted suffix.
 * - 500 `INTERNAL_SERVER_ERROR` when both strategies fail.
 */
export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log('-------');
  console.log(queryUrl);
  console.log('-------');

  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: queryUrl'
      }
    });
    return;
  }

  // Reject arrays/objects, malformed URLs and non-http(s) schemes up front:
  // otherwise `new URL` would throw outside any try block, and a scheme such as
  // file:// would be handed to the browser.
  const target = typeof queryUrl === 'string' ? parseHttpUrl(queryUrl) : undefined;
  if (typeof queryUrl !== 'string' || !target) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'INVALID_PARAM',
        message: '无效的参数: queryUrl'
      }
    });
    return;
  }

  const urlDomain = target.hostname;
  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: 'BLACKLISTED_DOMAIN',
        message: '该域名受到保护中'
      }
    });
    return;
  }

  // Strategy 1: plain HTTP fetch.
  try {
    const response = await fetch(queryUrl, {
      headers: {
        'User-Agent': new UserAgent({
          deviceCategory: 'desktop',
          platform: 'Linux x86_64'
        }).toString(),
        Referer: 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        Connection: 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const content = await response.text();
    const $ = cheerio.load(content);
    const cleanedContent = $('body').html();

    res.status(200).json({
      status: 200,
      data: {
        title: $('title').text(),
        content: cleanedContent
      }
    });

    // The response is already sent; a cache failure must not trigger the
    // browser fallback (which would try to respond a second time).
    try {
      await updateCacheAsync(queryUrl, cleanedContent || '');
    } catch (cacheError) {
      console.error('Failed to update page cache:', cacheError);
    }
    console.log('Page read successfully');
    return;
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
    if (res.headersSent) return;
  }

  // Strategy 2: render the page in headless Chromium.
  try {
    const { title, content } = await renderWithBrowser(queryUrl, target.toString());

    res.status(200).json({
      status: 200,
      data: {
        title,
        content
      }
    });

    try {
      await updateCacheAsync(queryUrl, content || '');
    } catch (cacheError) {
      console.error('Failed to update page cache:', cacheError);
    }
    console.log('Page read successfully');
  } catch (error) {
    console.error(error);
    if (res.headersSent) return;
    res.status(500).json({
      status: 500,
      error: {
        code: 'INTERNAL_SERVER_ERROR',
        message: '读取页面时发生内部服务器错误'
      }
    });
  }
};
|
||||
132
plugins/webcrawler/SPIDER/src/controllers/searchController.ts
Normal file
132
plugins/webcrawler/SPIDER/src/controllers/searchController.ts
Normal file
@@ -0,0 +1,132 @@
|
||||
import { Request, Response } from 'express';
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
import dotenv from 'dotenv';
|
||||
import { performDeepSearch } from '../utils/deepSearch';
|
||||
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
|
||||
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// Navigation strategies (JSON array in STRATEGIES) forwarded to performDeepSearch.
const strategies = JSON.parse(process.env.STRATEGIES || '[]');

// Comma-separated site substrings forwarded to performDeepSearch.
// DETECT_WEBSITES is preferred; DETECT_WEBSITE is accepted as a fallback because
// .env.example declares the singular name. Blank entries are dropped.
const detectWebsites = (process.env.DETECT_WEBSITES ?? process.env.DETECT_WEBSITE ?? '')
  .split(',')
  .map((site) => site.trim())
  .filter(Boolean);

// Upper bound on concurrent browser contexts in the puppeteer-cluster.
const maxConcurrency = parseInt(process.env.MAX_CONCURRENCY || '10', 10);

/**
 * Express handler for the search endpoint.
 *
 * Query parameters:
 * - `query` (required): search terms.
 * - `pageCount` (default 10): number of results wanted.
 * - `needDetails` (default 'false'): when 'true', result pages are crawled via
 *   performDeepSearch; otherwise the engine's snippets are returned directly.
 * - `engine` (default 'baidu'): 'baidu' or 'searchxng'.
 * - `categories` (default 'general'): passed through to the engine.
 *
 * Responses:
 * - 200 `{ status, data: { results } }`.
 * - 400 `MISSING_PARAM` when `query` is absent.
 * - 400 `INVALID_PARAM` when `query` is not a single string.
 * - 400 `INVALID_ENGINE` for an unknown engine.
 * - 500 `INTERNAL_SERVER_ERROR` on any failure while searching.
 */
export const search = async (req: Request, res: Response): Promise<void> => {
  const {
    query,
    pageCount = 10,
    needDetails = 'false',
    engine = 'baidu',
    categories = 'general'
  } = req.query;
  const needDetailsBool = needDetails === 'true';
  const wantedCount = Number(pageCount);

  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: query'
      }
    });
    return;
  }

  // Express turns repeated or bracketed query keys into arrays/objects; only a
  // plain string can be used as the search terms.
  if (typeof query !== 'string') {
    res.status(400).json({
      status: 400,
      error: {
        code: 'INVALID_PARAM',
        message: '无效的参数: query'
      }
    });
    return;
  }

  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: 'INVALID_ENGINE',
          message: '无效的搜索引擎'
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(
      query,
      wantedCount,
      searchUrlBase || '',
      categories as string
    );

    // Nothing found: answer with an empty list.
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      // No crawl will happen, so entries still marked 'Pending' are reported as 'Success'.
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
    } else {
      console.log('Need details is true');

      // NOTE(review): this handler never closes the cluster itself; confirm that
      // performDeepSearch shuts it down, otherwise each request leaks a browser.
      const clusterInstance = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: maxConcurrency,
        puppeteerOptions: {
          ignoreDefaultArgs: ['--enable-automation'],
          headless: true,
          executablePath: '/usr/bin/chromium', // explicit Chromium path
          pipe: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu'
          ]
        }
      });

      const sortedResults = await performDeepSearch(
        clusterInstance,
        resultUrls,
        results,
        strategies,
        detectWebsites,
        wantedCount
      );
      res.status(200).json({
        status: 200,
        data: {
          results: sortedResults.slice(0, wantedCount)
        }
      });
    }
  } catch (error) {
    // Log the cause; previously failures were swallowed without any trace.
    console.error('Search failed:', error);
    if (res.headersSent) return;
    res.status(500).json({
      status: 500,
      error: {
        code: 'INTERNAL_SERVER_ERROR',
        message: '发生错误'
      }
    });
  }
};
|
||||
|
||||
export default { search };
|
||||
207
plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts
Normal file
207
plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts
Normal file
@@ -0,0 +1,207 @@
|
||||
import { URL } from 'url';
|
||||
import { JSDOM } from 'jsdom';
|
||||
import puppeteer from 'puppeteer';
|
||||
import { setupPage } from '../utils/setupPage';
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
|
||||
/**
 * Pauses for a randomly chosen delay.
 *
 * For integer bounds the delay is a whole number of milliseconds between
 * `min` and `max`, both inclusive.
 *
 * @param min - Shortest delay in milliseconds.
 * @param max - Longest delay in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
async function randomWait(min: number, max: number) {
  const span = max - min + 1;
  const pauseMs = min + Math.floor(Math.random() * span);
  return new Promise((done) => {
    setTimeout(done, pauseMs);
  });
}
|
||||
|
||||
/**
 * Runs a Baidu search and tries to replace Baidu's result links with the
 * URLs they finally land on.
 *
 * Steps:
 *  1. Load the result pages in headless Chromium and scrape title, link and
 *     snippet of every `.result.c-container` element.
 *  2. Request every scraped link with a plain HTTP fetch and record the URL
 *     the response ends up at.
 *  3. Retry the links that failed step 2 inside a puppeteer-cluster (only
 *     launched when there is something to retry).
 *
 * @param query - Search terms.
 * @param pageCount - Number of results wanted; result pages are assumed to
 *   hold 10 entries each.
 * @param searchUrlBase - Baidu search endpoint. When empty, an empty result
 *   is returned without starting a browser.
 * @param categories - Not used by this engine; present so all engines share
 *   one signature.
 * @returns `resultUrls` and a `results` map keyed by the same URLs, limited
 *   to the first `pageCount` scraped entries.
 */
export const fetchSearchResults = async (
  query: string,
  pageCount: number,
  searchUrlBase: string,
  categories: string
) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // Nothing to query without a search endpoint.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  const pagesToFetch = Math.ceil(pageCount / 10);

  // Shared by the single browser (step 1) and the cluster (step 3).
  const chromiumOptions = {
    ignoreDefaultArgs: ['--enable-automation'],
    headless: true,
    executablePath: '/usr/bin/chromium', // explicit Chromium path
    pipe: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
  };

  // ---- Step 1: scrape the result pages -----------------------------------
  const browser = await puppeteer.launch(chromiumOptions);
  try {
    const page = await browser.newPage();
    await setupPage(page);

    for (let i = 0; i < pagesToFetch; i++) {
      const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
      console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
      let retryCount = 0;
      let success = false;

      while (retryCount < 5 && !success) {
        try {
          // The timer is always ended, so a retry can reuse the same label.
          const loadLabel = `Page Load Time for page ${i + 1}`;
          console.time(loadLabel);
          try {
            await page.goto(searchUrl.toString(), { waitUntil: 'load' });
          } finally {
            console.timeEnd(loadLabel);
          }

          const document = new JSDOM(await page.content()).window.document;
          console.log(document.title);

          // Baidu answered with its security-verification page: re-apply the
          // page setup, wait a little and try again.
          if (document.title.includes('百度安全验证')) {
            console.log('Detected Baidu security verification, retrying...');
            await setupPage(page);
            retryCount++;
            await randomWait(1000, 3000);
            continue;
          }

          // Parse the search results.
          const parseLabel = `Link Retrieval Time for page ${i + 1}`;
          console.time(parseLabel);
          try {
            for (const result of document.querySelectorAll('.result.c-container')) {
              // Collect a few spare links beyond what was asked for.
              if (resultUrls.length > pageCount + 5) {
                break;
              }
              const titleElement = result.querySelector('h3 a');
              const title = titleElement ? titleElement.textContent : '';
              const url = titleElement ? titleElement.getAttribute('href') : '';
              const contentElement = result.querySelector('[class^="content"]');
              const content = contentElement ? contentElement.textContent : '';

              // Skip links already recorded (e.g. when a page is parsed again
              // after a retry) so they are not resolved twice.
              if (url && !results.has(url)) {
                resultUrls.push(url);
                results.set(url, {
                  title,
                  url,
                  snippet: content,
                  source: 'baidu',
                  crawlStatus: 'Pending',
                  score: 0
                });
              }
            }
          } finally {
            console.timeEnd(parseLabel);
          }
          success = true;
        } catch (error) {
          console.error(`Error fetching page ${i + 1}:`, error);
          retryCount++;
        }
      }
    }
  } finally {
    // Always release Chromium, even when scraping threw.
    await browser.close();
  }

  console.log('fetch all fake urls');

  // ---- Step 2: quick resolution of the real URLs -------------------------
  const urlsToProcessWithPuppeteer: string[] = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          Referer: 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          Connection: 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      // Leave this link for the browser-based fallback below.
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  // ---- Step 3: resolve the leftovers concurrently in a browser cluster ---
  let failedUrlCount = 0;

  // Launching Chromium is expensive; skip it when step 2 resolved everything.
  if (urlsToProcessWithPuppeteer.length > 0) {
    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 10,
      puppeteerOptions: chromiumOptions
    });

    try {
      await cluster.task(async ({ page, data: url }) => {
        let retryUrlCount = 0;
        let urlSuccess = false;
        while (retryUrlCount < 3 && !urlSuccess) {
          console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
          try {
            await page.goto(url, { waitUntil: 'load' });
            // Guard against the page having been closed in the meantime.
            if (page.isClosed()) {
              throw new Error('Page has been closed');
            }
            const realUrl = page.url(); // URL after navigation finished
            const result = results.get(url);
            if (result) {
              result.url = realUrl;
              result.crawlStatus = 'Success';
            }
            urlSuccess = true;
          } catch (error) {
            console.error(`Error fetching original URL, retrying...`, error);
            retryUrlCount++;
            await randomWait(1000, 3000);
          }
        }
        if (!urlSuccess) {
          failedUrlCount++;
        }
      });

      for (const url of urlsToProcessWithPuppeteer) {
        cluster.queue(url);
      }

      await cluster.idle();
    } finally {
      // Always shut the cluster's browsers down.
      await cluster.close();
    }
  }

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // Keep only the first pageCount results.
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return {
    resultUrls: filteredResults.map((result) => result.url),
    results: new Map(filteredResults.map((result) => [result.url, result]))
  };
};
|
||||
64
plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts
Normal file
64
plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
import axios from 'axios';
|
||||
import { URL } from 'url';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Load variables from .env before process.env is read below.
dotenv.config();

// BLACKLIST, when set, is parsed as JSON; fetchSearchResults drops every
// result whose hostname ends with one of the entries. Unset means no blacklist.
const rawBlacklist = process.env.BLACKLIST;
const blacklistDomains = rawBlacklist ? JSON.parse(rawBlacklist) : [];
|
||||
|
||||
/**
 * Queries a SearXNG instance page by page until enough results are collected.
 *
 * Results are skipped when their URL cannot be parsed, when the hostname ends
 * with a blacklisted entry, when the hostname contains "zhihu", or when the
 * same URL was already collected from an earlier page.
 *
 * @param query - Search terms.
 * @param pageCount - Number of results wanted (not the number of pages).
 * @param searchUrlBase - SearXNG search endpoint. When empty, an empty result
 *   is returned without any request.
 * @param categories - Value for SearXNG's `categories` parameter; omitted
 *   from the request when empty.
 * @returns `resultUrls` in discovery order and a `results` map keyed by URL.
 * @throws Whatever `axios.get` or `new URL(searchUrlBase)` throws.
 */
export const fetchSearchResults = async (
  query: string,
  pageCount: number,
  searchUrlBase: string,
  categories: string
) => {
  // Upper bound on the number of pages requested, so that heavy filtering
  // cannot make the loop run forever.
  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1;
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  let pageIndex = 0;

  while (resultUrls.length < pageCount && pageIndex < MAX_PAGES) {
    // Build the query through searchParams so every value is encoded and a
    // base URL that already carries a query string keeps working.
    const searchUrl = new URL(searchUrlBase);
    searchUrl.searchParams.set('q', query);
    searchUrl.searchParams.set('pageno', String(pageIndex + 1));
    searchUrl.searchParams.set('format', 'json');
    if (categories) {
      searchUrl.searchParams.set('categories', categories);
    }
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);

    const response = await axios.get(searchUrl.toString());
    // Tolerate a response without a `results` array.
    const jsonResults: any[] = Array.isArray(response.data?.results) ? response.data.results : [];

    for (const result of jsonResults) {
      let resultDomain: string;
      try {
        resultDomain = new URL(result.url).hostname;
      } catch {
        // One malformed URL must not abort the whole search.
        continue;
      }
      if (
        blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) ||
        resultDomain.includes('zhihu')
      ) {
        continue;
      }
      // The same URL can show up on several pages; count it once.
      if (results.has(result.url)) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      if (resultUrls.length >= pageCount) {
        break;
      }
    }

    pageIndex++;
    if (jsonResults.length === 0) {
      break; // no more results available
    }
  }

  return { resultUrls, results };
};
|
||||
18
plugins/webcrawler/SPIDER/src/index.ts
Normal file
18
plugins/webcrawler/SPIDER/src/index.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import express, { Application } from 'express';
|
||||
import bodyParser from 'body-parser';
|
||||
import searchRoutes from './routes/searchRoutes';
|
||||
import readRoutes from './routes/readRoutes';
|
||||
import quickfetchRoutes from './routes/quickfetchRoutes';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Pull settings from .env into process.env.
dotenv.config();

const app: Application = express();

// Parse JSON request bodies, then mount every router under the /api prefix
// (search, read, quick-fetch — in that order).
app.use(bodyParser.json());
for (const router of [searchRoutes, readRoutes, quickfetchRoutes]) {
  app.use('/api', router);
}

// Listen on PORT from the environment, falling back to 3000.
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`Server running on port ${PORT}`);
});
|
||||
21
plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts
Normal file
21
plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
import { Request, Response, NextFunction } from 'express';
|
||||
|
||||
/**
 * Express middleware that protects a route with a shared bearer token.
 *
 * The second space-separated part of the `Authorization` header is compared
 * with the `ACCESS_TOKEN` environment variable.
 *
 * Responses:
 *  - 401 `{ message: 'Bearer token not found' }` when the header is absent.
 *  - 403 `{ message: 'Invalid token' }` when the token is missing, does not
 *    match, or no `ACCESS_TOKEN` is configured.
 *  - otherwise `next()` is called.
 *
 * The header value is a secret and is deliberately never logged.
 */
const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];

  if (!bearerHeader) {
    res.status(401).json({ message: 'Bearer token not found' });
    return;
  }

  // Header shape is "<scheme> <token>"; only the token part is compared.
  const bearerToken = bearerHeader.split(' ')[1];
  const expectedToken = process.env.ACCESS_TOKEN;

  // Both sides must be non-empty: otherwise a header without a token would
  // equal an unset ACCESS_TOKEN (undefined === undefined) and be let through.
  if (expectedToken && bearerToken && bearerToken === expectedToken) {
    next();
    return;
  }

  res.status(403).json({ message: 'Invalid token' });
};
|
||||
|
||||
export default authMiddleware;
|
||||
9
plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import express from 'express';
|
||||
import { quickFetch } from '../controllers/quickfetchController';
|
||||
import authMiddleware from '../middleware/authMiddleware';
|
||||
|
||||
// Router for the quick-fetch endpoint: GET /quickFetch passes through
// authMiddleware and is then handled by the quickFetch controller.
const quickfetchRouter = express.Router();

quickfetchRouter.get('/quickFetch', authMiddleware, quickFetch);

export default quickfetchRouter;
|
||||
9
plugins/webcrawler/SPIDER/src/routes/readRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/readRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import express from 'express';
|
||||
import { readPage } from '../controllers/readController';
|
||||
import authMiddleware from '../middleware/authMiddleware';
|
||||
|
||||
// Router for the page-reading endpoint: GET /read passes through
// authMiddleware and is then handled by the readPage controller.
const readRouter = express.Router();

readRouter.get('/read', authMiddleware, readPage);

export default readRouter;
|
||||
9
plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import express from 'express';
|
||||
import searchController from '../controllers/searchController';
|
||||
import authMiddleware from '../middleware/authMiddleware';
|
||||
|
||||
// Router for the search endpoint: GET /search passes through authMiddleware
// and is then handled by searchController.search.
const searchRouter = express.Router();

searchRouter.get('/search', authMiddleware, searchController.search);

export default searchRouter;
|
||||
26
plugins/webcrawler/SPIDER/src/specialHandlers/index.ts
Normal file
26
plugins/webcrawler/SPIDER/src/specialHandlers/index.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import { Page } from 'puppeteer';
|
||||
|
||||
/**
 * Site-specific content extraction for pages that need special treatment.
 *
 * - URLs containing `blog.csdn.net`: waits for the `<article>` element and
 *   returns its inner HTML.
 * - URLs containing `zhuanlan.zhihu.com`: returns `null` when the page holds
 *   Zhihu's "request restricted" error payload (code 40362); otherwise
 *   clicks the button labelled "关闭" (close) and returns the `<article>`
 *   inner HTML.
 * - Any other URL: returns `null` so the caller can fall back to its generic
 *   extraction.
 *
 * @param page - Puppeteer page that has already navigated to `url`.
 * @param url - Address of the loaded page; matched by substring.
 * @returns Article HTML, or `null` when no special handling applied or the
 *   site refused the request.
 * @throws When an awaited selector does not appear (Puppeteer timeout).
 */
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  // Both supported sites keep the post body inside an <article> element.
  const readArticleHtml = async (): Promise<string> => {
    await page.waitForSelector('article');
    return page.$eval('article', (el) => el.innerHTML);
  };

  if (url.includes('blog.csdn.net')) {
    return readArticleHtml();
  }

  if (url.includes('zhuanlan.zhihu.com')) {
    console.log('是知乎,需要点击按掉!');
    // Read the markup once; it is not written to the log because a full page
    // dump floods the output.
    const html = await page.content();
    if (
      html.includes(
        '{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}'
      )
    ) {
      return null;
    }
    // Select the close button through its aria-label and click it.
    await page.waitForSelector('button[aria-label="关闭"]');
    await page.click('button[aria-label="关闭"]');
    return readArticleHtml();
  }

  // Handling for further special sites can be added here.
  return null;
};
|
||||
77
plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts
Normal file
77
plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
import NodeCache from 'node-cache';
|
||||
import { MongoClient } from 'mongodb';
|
||||
import crypto from 'crypto';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Load .env so the settings below can be overridden per deployment.
dotenv.config();

// In-memory cache; its default entry TTL comes from STD_TTL (3600 when unset).
const memoryTtl = Number.parseInt(process.env.STD_TTL || '3600');
const cache = new NodeCache({ stdTTL: memoryTtl });

// Persistent cache: MongoDB at MONGODB_URI, or a local instance when unset.
const mongoUri = process.env.MONGODB_URI || 'mongodb://localhost:27017';
const mongoClient = new MongoClient(mongoUri);

// Database and collection that hold the cached pages.
const dbName = 'pageCache';
const collectionName = 'pages';
|
||||
|
||||
/**
 * Connects the shared Mongo client and hands back the page-cache database.
 *
 * @returns The `Db` handle for `dbName`.
 */
const connectToMongo = async () => {
  await mongoClient.connect();
  const database = mongoClient.db(dbName);
  return database;
};
|
||||
|
||||
/**
 * Creates the TTL index on `updatedAt` for the pages collection.
 *
 * The expiry is read from EXPIRE_AFTER_SECONDS (9000 when unset). Failures
 * are logged and swallowed, so the returned promise does not reject for them.
 */
const createTTLIndex = async () => {
  const expireAfterSeconds = Number.parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000');
  try {
    const database = await connectToMongo();
    const pages = database.collection(collectionName);
    await pages.createIndex({ updatedAt: 1 }, { expireAfterSeconds });
    console.log('TTL index created successfully');
  } catch (error) {
    console.error('Error creating TTL index:', error);
  }
};
|
||||
|
||||
/**
 * Fingerprints page content.
 *
 * @param content - Text to hash.
 * @returns The MD5 digest of `content` as a lowercase hex string.
 */
const getPageHash = (content: string) => {
  const md5 = crypto.createHash('md5');
  md5.update(content);
  return md5.digest('hex');
};
|
||||
|
||||
/**
 * Looks a page up in the cache, memory first and MongoDB second.
 *
 * A document found in MongoDB is copied into the in-memory cache before it
 * is returned.
 *
 * @param url - Page address used as the cache key.
 * @returns The cached entry, or the `findOne` result (null when absent).
 * @throws Re-throws any error raised while querying MongoDB, after logging.
 */
export const getCachedPage = async (url: string) => {
  const inMemory = cache.get(url);
  if (inMemory) {
    return inMemory;
  }

  try {
    const database = await connectToMongo();
    const stored = await database.collection(collectionName).findOne({ url });
    if (stored) {
      cache.set(url, stored);
    }
    return stored;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};
|
||||
|
||||
/**
 * Stores a page in both cache layers.
 *
 * The in-memory cache is written first; the MongoDB document for `url` is
 * then upserted with the content, its hash and the current time.
 *
 * @param url - Page address used as the cache key.
 * @param content - Page content to store.
 * @throws Re-throws any error raised by the MongoDB write, after logging.
 */
const savePageToCache = async (url: string, content: string) => {
  const page = {
    url,
    content,
    hash: getPageHash(content),
    updatedAt: new Date()
  };

  // Refresh the in-memory copy.
  cache.set(url, page);

  try {
    const database = await connectToMongo();
    const pages = database.collection(collectionName);
    // Refresh the persistent copy.
    await pages.updateOne({ url }, { $set: page }, { upsert: true });
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
};
|
||||
|
||||
/**
 * Public entry point for writing a page to the cache; forwards to
 * savePageToCache and settles when that write has finished.
 *
 * @param url - Page address used as the cache key.
 * @param content - Page content to store.
 */
export const updateCacheAsync = async (url: string, content: string) => {
  const pendingWrite = savePageToCache(url, content);
  await pendingWrite;
};
|
||||
|
||||
// On SIGINT, close the MongoDB client and then exit with status 0.
const closeMongoAndExit = async () => {
  await mongoClient.close();
  process.exit(0);
};
process.on('SIGINT', closeMongoAndExit);

// Create the TTL index when the application starts (i.e. when this module loads).
createTTLIndex();
|
||||
158
plugins/webcrawler/SPIDER/src/utils/deepSearch.ts
Normal file
158
plugins/webcrawler/SPIDER/src/utils/deepSearch.ts
Normal file
@@ -0,0 +1,158 @@
|
||||
import { Cluster } from 'puppeteer-cluster';
|
||||
import * as cheerio from 'cheerio';
|
||||
import UserAgent from 'user-agents';
|
||||
import { setupPage } from './setupPage';
|
||||
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
|
||||
import { handleSpecialWebsite } from '../specialHandlers';
|
||||
import fetch from 'node-fetch';
|
||||
|
||||
interface CachedPage {
|
||||
url: string;
|
||||
content: string;
|
||||
hash: string;
|
||||
updatedAt: Date;
|
||||
}
|
||||
|
||||
/** Stores crawled content on the existing result entry (if any) and marks it successful. */
const recordCrawlSuccess = (results: Map<string, any>, url: string, content: string): void => {
  const entry = results.get(url);
  if (entry) {
    entry.content = content;
    entry.crawlStatus = 'Success';
  }
};

/** Marks a URL as failed, keeping whatever fields (title, snippet, score, …) its entry already has. */
const recordCrawlFailure = (results: Map<string, any>, url: string, reason: unknown): void => {
  const message = reason instanceof Error ? reason.message : String(reason);
  const entry = results.get(url);
  if (entry) {
    entry.error = message;
    entry.crawlStatus = 'Failed';
  } else {
    results.set(url, { url, error: message, crawlStatus: 'Failed' });
  }
};

/** Writes a page to the cache; a cache failure is logged and never fails the crawl. */
const cachePageBestEffort = async (url: string, content: string): Promise<void> => {
  try {
    await updateCacheAsync(url, content);
  } catch (error) {
    console.error(`Failed to cache page ${url}:`, error);
  }
};

/**
 * Fetches the content of the given result URLs and attaches it to `results`.
 *
 * For every URL, in order of preference:
 *  1. use the cached page;
 *  2. fetch the page with a plain HTTP request;
 *  3. load the page in a cluster-provided browser page, trying each entry
 *     of `strategies` until one navigation succeeds.
 *
 * Entries are updated in place with `content` / `crawlStatus`, or with
 * `error` (a message string) and `crawlStatus: 'Failed'`. Cache problems are
 * logged but never change a crawl outcome.
 *
 * @param clusterInstance - puppeteer-cluster to run on. It is closed before
 *   this function returns.
 * @param resultUrls - URLs to crawl; at most `pageCount + 10` are queued.
 * @param results - Result entries keyed by URL; mutated in place.
 * @param strategies - Navigation options tried in order; each entry's
 *   `waitUntil` and `timeout` are passed to `page.goto`.
 * @param detectWebsites - URL substrings for which the full `setupPage`
 *   disguise is applied instead of only a random user agent.
 * @param pageCount - Number of results the caller wants.
 * @returns All result entries sorted by descending `score` (a missing score
 *   counts as 0).
 */
export const performDeepSearch = async (
  clusterInstance: Cluster,
  resultUrls: string[],
  results: Map<string, any>,
  strategies: any[],
  detectWebsites: string[],
  pageCount: number
) => {
  const tasks: Promise<unknown>[] = [];

  try {
    await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
      // ---- 1. cache -------------------------------------------------------
      try {
        const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null;
        if (cachedPage) {
          recordCrawlSuccess(results, searchUrl, cachedPage.content);
          return;
        }
      } catch (error) {
        // A broken cache must not fail the crawl: fall through to a live fetch.
        console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      }

      // ---- 2. plain HTTP fetch --------------------------------------------
      try {
        const response = await fetch(searchUrl, {
          headers: {
            'User-Agent': new UserAgent({
              deviceCategory: 'desktop',
              platform: 'Linux x86_64'
            }).toString(),
            Referer: 'https://www.google.com/',
            'Accept-Language': 'en-US,en;q=0.9',
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            Connection: 'keep-alive',
            'Cache-Control': 'no-cache'
          }
        });

        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }

        const $ = cheerio.load(await response.text());
        const cleanedContent = $('body').html() || '';
        recordCrawlSuccess(results, searchUrl, cleanedContent);
        await cachePageBestEffort(searchUrl, cleanedContent);
        return;
      } catch (error) {
        // Fall through to the browser-based crawl.
        console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
      }

      // ---- 3. browser crawl -----------------------------------------------
      try {
        if (detectWebsites.some((website) => searchUrl.includes(website))) {
          await setupPage(page);
        } else {
          const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
          await page.setUserAgent(userAgent.toString());
        }
      } catch (error) {
        console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
      }

      let pageLoaded = false;
      let pageLoadError: unknown = null;
      for (const strategy of strategies) {
        try {
          await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
          pageLoaded = true;
          break;
        } catch (error: any) {
          pageLoadError = error;
          // Only a timeout justifies trying the next strategy; any other
          // navigation error is final for this URL.
          if (error?.name !== 'TimeoutError') {
            break;
          }
        }
      }
      if (!pageLoaded) {
        recordCrawlFailure(results, searchUrl, pageLoadError ?? new Error('Page failed to load'));
        return;
      }

      try {
        let cleanedContent = await handleSpecialWebsite(page, searchUrl);
        if (!cleanedContent) {
          const $ = cheerio.load(await page.content());
          cleanedContent = $('body').html() || '';
        }
        recordCrawlSuccess(results, searchUrl, cleanedContent);
        await cachePageBestEffort(searchUrl, cleanedContent);
      } catch (error) {
        recordCrawlFailure(results, searchUrl, error);
      } finally {
        await page.close().catch(() => {});
      }
    });

    for (const url of resultUrls) {
      // Queue a few spare URLs beyond what was asked for.
      if (tasks.length >= pageCount + 10) {
        break;
      }
      tasks.push(clusterInstance.queue({ searchUrl: url }));
    }

    await Promise.all(tasks);
    await clusterInstance.idle();
  } finally {
    // This function owns the cluster's shutdown, including on errors.
    await clusterInstance.close();
  }

  // A missing score counts as 0 so the comparator never yields NaN.
  return Array.from(results.values()).sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
};
|
||||
81
plugins/webcrawler/SPIDER/src/utils/setupPage.ts
Normal file
81
plugins/webcrawler/SPIDER/src/utils/setupPage.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
import { Page } from 'puppeteer';
|
||||
import randomUseragent from 'random-useragent';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
/** Returns whatever random-useragent's `getRandom()` picks. */
const getRandomUserAgent = () => randomUseragent.getRandom();
|
||||
|
||||
/** Picks one of three fixed `navigator.platform` values at random. */
const getRandomPlatform = () => {
  const candidates = ['Win32', 'MacIntel', 'Linux x86_64'];
  const pick = Math.floor(Math.random() * candidates.length);
  return candidates[pick];
};
|
||||
|
||||
// Proxy pool: VALIDATE_PROXY, when set, is parsed as JSON; unset means an
// empty pool.
const validateproxy = process.env.VALIDATE_PROXY ? JSON.parse(process.env.VALIDATE_PROXY) : [];

/** Returns a random entry of the proxy pool, or null when the pool is empty. */
const getRandomProxy = () => {
  if (validateproxy.length > 0) {
    const slot = Math.floor(Math.random() * validateproxy.length);
    return validateproxy[slot];
  }
  return null;
};
|
||||
|
||||
/** Picks one of three fixed `navigator.languages` lists at random. */
const getRandomLanguages = () => {
  const profiles = [
    ['zh-CN', 'zh', 'en'],
    ['en-US', 'en', 'fr'],
    ['es-ES', 'es', 'en']
  ];
  const pick = Math.floor(Math.random() * profiles.length);
  return profiles[pick];
};
|
||||
|
||||
/**
 * Prepares a Puppeteer page so that it looks less like an automated browser.
 *
 * Registers a script that runs before any document script of every new
 * document and that: removes `navigator.webdriver`, fakes `window.chrome`,
 * overrides `navigator.userAgent`, `platform`, `plugins` and `languages`,
 * and patches `navigator.permissions.query` for notifications.
 *
 * The random user agent / platform / languages are chosen once per call,
 * here in Node, and passed into the page. The overrides are configurable so
 * that calling `setupPage` again on the same page can redefine them.
 *
 * @param page - Page to prepare; call before navigating.
 */
export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  if (proxy) {
    // NOTE(review): this hands the proxy's ip/port to page.authenticate as
    // HTTP credentials; it does not route traffic through the proxy. Confirm
    // the intended VALIDATE_PROXY schema and usage.
    await page.authenticate({
      username: proxy.ip,
      password: proxy.port.toString()
    });
  }

  // The callback below is serialised and executed inside the browser, where
  // this module's helper functions do not exist. The values are therefore
  // computed here and passed in as an argument.
  const spoofed = {
    userAgent: getRandomUserAgent(),
    platform: getRandomPlatform(),
    languages: getRandomLanguages()
  };

  await page.evaluateOnNewDocument((profile: typeof spoofed) => {
    const newProto = (navigator as any).__proto__;
    delete newProto.webdriver;
    (navigator as any).__proto__ = newProto;

    (window as any).chrome = {};
    (window as any).chrome.app = {
      InstallState: 'testt',
      RunningState: 'estt',
      getDetails: 'stte',
      getIsInstalled: 'ttes'
    };
    (window as any).chrome.csi = function () {};
    (window as any).chrome.loadTimes = function () {};
    (window as any).chrome.runtime = function () {};

    // configurable: true lets a later setupPage call on the same page
    // redefine these properties instead of throwing.
    Object.defineProperty(navigator, 'userAgent', {
      configurable: true,
      get: () => profile.userAgent
    });
    Object.defineProperty(navigator, 'platform', {
      configurable: true,
      get: () => profile.platform
    });
    Object.defineProperty(navigator, 'plugins', {
      configurable: true,
      get: () => [
        {
          description: 'Shockwave Flash',
          filename: 'pepflashplayer.dll',
          length: 1,
          name: 'Shockwave Flash'
        }
      ]
    });
    Object.defineProperty(navigator, 'languages', {
      configurable: true,
      get: () => profile.languages
    });

    // Answer notification-permission queries from Notification.permission and
    // delegate every other query to the original implementation.
    const originalQuery = (window.navigator.permissions as any).query;
    (window.navigator.permissions as any).query = (parameters: any) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission } as PermissionStatus)
        : originalQuery(parameters);
  }, spoofed);
};
|
||||
113
plugins/webcrawler/SPIDER/tsconfig.json
Normal file
113
plugins/webcrawler/SPIDER/tsconfig.json
Normal file
@@ -0,0 +1,113 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
/* Visit https://aka.ms/tsconfig to read more about this file */
|
||||
/* Projects */
|
||||
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
|
||||
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
|
||||
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
|
||||
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
|
||||
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
|
||||
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||
"types": ["node"],
|
||||
/* Language and Environment */
|
||||
"target": "es6", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
||||
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
|
||||
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
|
||||
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
|
||||
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
|
||||
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
|
||||
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
|
||||
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
|
||||
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||
|
||||
/* Modules */
|
||||
//"module": "es6", /* Specify what module code is generated. */
|
||||
"rootDir": "./src", /* Specify the root folder within your source files. */
|
||||
"moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
||||
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ /* Specify type package names to be included without being referenced in a source file. */
|
||||
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
|
||||
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
|
||||
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
|
||||
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
|
||||
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
|
||||
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
|
||||
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
|
||||
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
|
||||
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
|
||||
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||
|
||||
/* JavaScript Support */
|
||||
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
|
||||
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
|
||||
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
|
||||
|
||||
/* Emit */
|
||||
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
|
||||
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
|
||||
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
|
||||
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
|
||||
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
|
||||
// "noEmit": true, /* Disable emitting files from a compilation. */
|
||||
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
|
||||
"outDir": "./dist", /* Specify an output folder for all emitted files. */
|
||||
// "removeComments": true, /* Disable emitting comments. */
|
||||
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
|
||||
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
|
||||
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
|
||||
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
|
||||
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
|
||||
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
|
||||
// "newLine": "crlf", /* Set the newline character for emitting files. */
|
||||
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
|
||||
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
|
||||
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
|
||||
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
|
||||
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
|
||||
|
||||
/* Interop Constraints */
|
||||
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
|
||||
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
|
||||
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
|
||||
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
|
||||
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
|
||||
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
|
||||
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
|
||||
|
||||
/* Type Checking */
|
||||
"typeRoots": ["./node_modules/@types"],
|
||||
"strict": true, /* Enable all strict type-checking options. */
|
||||
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
|
||||
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
|
||||
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
|
||||
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
|
||||
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
|
||||
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
|
||||
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
|
||||
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
|
||||
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
|
||||
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
|
||||
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
|
||||
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
|
||||
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
|
||||
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
|
||||
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
|
||||
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
|
||||
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
|
||||
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
|
||||
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||
|
||||
/* Completeness */
|
||||
// "skipDefaultLibCheck": true,
|
||||
// /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||
"skipLibCheck": true/* Skip type checking all .d.ts files. */
|
||||
|
||||
},
|
||||
"include": ["src/**/*.ts"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
55
plugins/webcrawler/SPIDER/webpack.config.js
Normal file
55
plugins/webcrawler/SPIDER/webpack.config.js
Normal file
@@ -0,0 +1,55 @@
|
||||
// 引入path包
|
||||
const path = require('path')
|
||||
require('dotenv').config();
|
||||
const mode = process.env.NODE_ENV || 'development'
|
||||
|
||||
const nodeExternals = require('webpack-node-externals');
|
||||
module.exports = {
|
||||
target: 'node', // 指定构建目标为 Node.js
|
||||
externals: [nodeExternals()], // 排除 node_modules
|
||||
// 指定入口文件
|
||||
entry: "./src/index.ts",
|
||||
|
||||
// 指定打包文件所在目录
|
||||
output: {
|
||||
path: path.resolve(__dirname, 'dist'),
|
||||
// 打包后文件的名称
|
||||
filename: "bundle.js"
|
||||
},
|
||||
resolve: {
|
||||
extensions: ['.ts', '.tsx', '.js', '.json'],
|
||||
fallback: {
|
||||
"zlib": require.resolve("browserify-zlib"),
|
||||
"querystring": require.resolve("querystring-es3"),
|
||||
"path": require.resolve("path-browserify"),
|
||||
"crypto": require.resolve("crypto-browserify"),
|
||||
"stream": require.resolve("stream-browserify"),
|
||||
"os": require.resolve("os-browserify/browser"),
|
||||
"http": require.resolve("stream-http"),
|
||||
"net": false,
|
||||
"string_decoder": require.resolve("string_decoder/"),
|
||||
"url": require.resolve("url/"),
|
||||
"buffer": require.resolve("buffer/"),
|
||||
"util": require.resolve("util/"),
|
||||
// 新增 assert 的 fallback
|
||||
"assert": require.resolve("assert/"),
|
||||
// 处理新出现的 vm 警告
|
||||
"vm": require.resolve("vm-browserify"),
|
||||
"fs": false
|
||||
}
|
||||
},
|
||||
|
||||
// 指定webpack打包的时候要使用的模块
|
||||
module: {
|
||||
// 指定要价在的规则
|
||||
rules: [
|
||||
{
|
||||
// test指定的是规则生效的文件,意思是,用ts-loader来处理以ts为结尾的文件
|
||||
test: /\.ts$/,
|
||||
use: 'ts-loader',
|
||||
exclude: /node_modules/
|
||||
}
|
||||
]
|
||||
},
|
||||
mode,
|
||||
}
|
||||
124
plugins/webcrawler/docker-compose.yaml
Normal file
124
plugins/webcrawler/docker-compose.yaml
Normal file
@@ -0,0 +1,124 @@
|
||||
# Compose stack for the web crawler plugin: Caddy, Valkey (service name "redis"),
# SearXNG, MongoDB and the crawler application built from this directory.
#
# The former top-level `version: "0.0.1"` was removed: it is not a Compose file
# format version, and the attribute is obsolete (Compose ignores it with a warning).
name: spider

# Shared log settings: json-file driver, a single log file capped at 1 MB.
x-logging: &default-logging
  driver: "json-file"
  options:
    max-size: "1m"
    max-file: "1"

services:
  # Caddy runs on the host network and is configured by ./Caddyfile.
  caddy:
    container_name: caddy
    image: docker.io/library/caddy:2-alpine
    network_mode: host
    restart: unless-stopped
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy-data:/data:rw
      - caddy-config:/config:rw
    environment:
      - SEARXNG_HOSTNAME=${SEARXNG_HOSTNAME:-http://localhost}
      - SEARXNG_TLS=${LETSENCRYPT_EMAIL:-internal}
    cap_add:
      - NET_BIND_SERVICE
    cap_drop:
      - ALL
    logging: *default-logging

  # Valkey server; settings.yml reaches it as redis://redis:6379/0.
  redis:
    container_name: redis
    image: docker.io/valkey/valkey:8-alpine
    command: valkey-server --save 30 1 --loglevel warning
    restart: unless-stopped
    networks:
      - searxng
    volumes:
      - valkey-data2:/data
    cap_drop:
      - ALL
    cap_add:
      - SETGID
      - SETUID
      - DAC_OVERRIDE
    logging: *default-logging

  # SearXNG; port 8080 is published on the host loopback interface only.
  searxng:
    container_name: searxng
    image: docker.io/searxng/searxng:latest
    restart: unless-stopped
    networks:
      - searxng
    ports:
      - "127.0.0.1:8080:8080"
    volumes:
      - ./searxng:/etc/searxng:rw
    environment:
      - SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/
      - UWSGI_WORKERS=${SEARXNG_UWSGI_WORKERS:-4}
      - UWSGI_THREADS=${SEARXNG_UWSGI_THREADS:-4}
    env_file:
      - .searchxng.env
    cap_drop:
      - ALL
    cap_add:
      - CHOWN
      - SETGID
      - SETUID
    logging: *default-logging

  mongodb:
    container_name: mongodb
    image: mongo:4.4
    restart: unless-stopped
    networks:
      - searxng
    # NOTE(review): 27017 is published on every host interface. If only local
    # access is needed, consider "127.0.0.1:27017:27017" as done for searxng.
    ports:
      - "27017:27017"
    volumes:
      - mongo-data:/data/db
    environment:
      # Root credentials can be overridden from the environment or .env;
      # the defaults are the previously hard-coded values.
      MONGO_INITDB_ROOT_USERNAME: ${MONGO_INITDB_ROOT_USERNAME:-root}
      MONGO_INITDB_ROOT_PASSWORD: ${MONGO_INITDB_ROOT_PASSWORD:-example}
    logging: *default-logging

  # Crawler application image, built from the Dockerfile in this directory.
  nodeapp:
    container_name: main
    build:
      context: .
    ports:
      - "3000:3000"
    networks:
      - searxng
    depends_on:
      - mongodb
    logging: *default-logging
    volumes:
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          memory: 4G
          cpus: '2.0'

networks:
  searxng:

volumes:
  caddy-data:
  caddy-config:
  valkey-data2:
  mongo-data:
|
||||
16
plugins/webcrawler/searxng-docker.service.template
Normal file
16
plugins/webcrawler/searxng-docker.service.template
Normal file
@@ -0,0 +1,16 @@
|
||||
[Unit]
|
||||
Description=SearXNG service
|
||||
Requires=docker.service
|
||||
After=docker.service
|
||||
|
||||
[Service]
|
||||
Restart=on-failure
|
||||
|
||||
Environment=SEARXNG_DOCKERCOMPOSEFILE=docker-compose.yaml
|
||||
|
||||
WorkingDirectory=/usr/local/searxng-docker
|
||||
ExecStart=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} up --remove-orphans
|
||||
ExecStop=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} down
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
6
plugins/webcrawler/searxng/limiter.toml
Normal file
6
plugins/webcrawler/searxng/limiter.toml
Normal file
@@ -0,0 +1,6 @@
|
||||
# This configuration file updates the default configuration file
|
||||
# See https://github.com/searxng/searxng/blob/master/searx/limiter.toml
|
||||
|
||||
[botdetection.ip_limit]
|
||||
# activate link_token method in the ip_limit method
|
||||
link_token = true
|
||||
38
plugins/webcrawler/searxng/settings.yml
Normal file
38
plugins/webcrawler/searxng/settings.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
# see https://docs.searxng.org/admin/settings/settings.html#settings-use-default-settings
|
||||
use_default_settings: true
|
||||
server:
|
||||
# base_url is defined in the SEARXNG_BASE_URL environment variable, see .env and docker-compose.yaml
|
||||
secret_key: "01042f00ae8bb522a9c03d3e7e1910318208a2c9fbdd23a6315577a9c98553a8" # change this!
|
||||
limiter: false # can be disabled for a private instance
|
||||
image_proxy: true
|
||||
ui:
|
||||
static_use_hash: true
|
||||
# 启用 cn 分类
|
||||
enabled_categories: [cn, general, images] # 按需添加其他分类
|
||||
# 或者定义分类显示顺序
|
||||
categories_order: [cn, general, images]
|
||||
redis:
|
||||
url: redis://redis:6379/0
|
||||
engines:
|
||||
- name: bing
|
||||
disabled: false
|
||||
categories: cn
|
||||
#- name: bilibili
|
||||
# engine: bilibili
|
||||
# shortcut: bil
|
||||
# disabled: false
|
||||
# categories: cn
|
||||
- name : baidu
|
||||
engine : json_engine
|
||||
paging : True
|
||||
first_page_num : 0
|
||||
search_url : https://www.baidu.com/s?tn=json&wd={query}&pn={pageno}&rn=50
|
||||
url_query : url
|
||||
title_query : title
|
||||
content_query : abs
|
||||
categories : cn
|
||||
|
||||
search:
|
||||
formats:
|
||||
- html
|
||||
- json
|
||||
@@ -32,6 +32,7 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
|
||||
import { formatTime2YMDHMS } from '@fastgpt/global/common/string/time';
|
||||
import MyModal from '@fastgpt/web/components/common/MyModal';
|
||||
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
|
||||
import SearchInput from '@fastgpt/web/components/common/Input/SearchInput';
|
||||
|
||||
type LogDetailType = {
|
||||
id: number;
|
||||
@@ -55,11 +56,13 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
||||
|
||||
const isRoot = userInfo?.username === 'root';
|
||||
const [filterProps, setFilterProps] = useState<{
|
||||
request_id?: string;
|
||||
channelId?: string;
|
||||
model?: string;
|
||||
code_type: 'all' | 'success' | 'error';
|
||||
dateRange: DateRangeType;
|
||||
}>({
|
||||
request_id: '',
|
||||
code_type: 'all',
|
||||
dateRange: {
|
||||
from: (() => {
|
||||
@@ -125,6 +128,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
||||
pageSize: 20,
|
||||
refreshDeps: [filterProps],
|
||||
params: {
|
||||
request_id: filterProps.request_id,
|
||||
channel: filterProps.channelId,
|
||||
model_name: filterProps.model,
|
||||
code_type: filterProps.code_type,
|
||||
@@ -162,7 +166,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
||||
content: item.content
|
||||
};
|
||||
});
|
||||
}, [data]);
|
||||
}, [channelList, data, systemModelList]);
|
||||
|
||||
const [logDetail, setLogDetail] = useState<LogDetailType>();
|
||||
|
||||
@@ -172,6 +176,13 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
||||
<Flex alignItems={'center'}>
|
||||
{Tab}
|
||||
<Box flex={1} />
|
||||
<Box flex={'0 0 200px'}>
|
||||
<SearchInput
|
||||
placeholder={t('account_model:log_request_id_search')}
|
||||
defaultValue={filterProps.request_id}
|
||||
onBlur={(e) => setFilterProps({ ...filterProps, request_id: e.target.value })}
|
||||
/>
|
||||
</Box>
|
||||
</Flex>
|
||||
)}
|
||||
<HStack spacing={4}>
|
||||
@@ -244,8 +255,8 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
||||
</Tr>
|
||||
</Thead>
|
||||
<Tbody>
|
||||
{formatData.map((item) => (
|
||||
<Tr key={item.id}>
|
||||
{formatData.map((item, index) => (
|
||||
<Tr key={index}>
|
||||
<Td>{item.channelName}</Td>
|
||||
<Td>{item.model}</Td>
|
||||
<Td>
|
||||
@@ -393,7 +404,7 @@ const LogDetail = ({ data, onClose }: { data: LogDetailType; onClose: () => void
|
||||
</GridItem>
|
||||
)}
|
||||
{detailData?.response_body && (
|
||||
<GridItem display={'flex'} borderBottomWidth="1px" borderRightWidth="1px" colSpan={2}>
|
||||
<GridItem display={'flex'} colSpan={2}>
|
||||
<Title>Response Body</Title>
|
||||
<Container>{detailData?.response_body}</Container>
|
||||
</GridItem>
|
||||
|
||||
@@ -93,7 +93,7 @@ function MemberTable({ Tabs }: { Tabs: React.ReactNode }) {
|
||||
|
||||
const { runAsync: onLeaveTeam } = useRequest2(
|
||||
async () => {
|
||||
const defaultTeam = myTeams.find((item) => item.defaultTeam) || myTeams[0];
|
||||
const defaultTeam = myTeams[0];
|
||||
// change to personal team
|
||||
onSwitchTeam(defaultTeam.teamId);
|
||||
return delLeaveTeam();
|
||||
|
||||
@@ -143,7 +143,7 @@ const checkInvalidData = async () => {
|
||||
|
||||
console.log(`检测集合完成`);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
console.log('checkInvalidData error', error);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -166,7 +166,9 @@ const checkInvalidDataText = async () => {
|
||||
await MongoDatasetDataText.deleteMany({
|
||||
dataId: { $in: unExistsSet }
|
||||
});
|
||||
} catch (error) {}
|
||||
} catch (error) {
|
||||
console.log('checkInvalidDataText error', error);
|
||||
}
|
||||
};
|
||||
|
||||
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
|
||||
|
||||
@@ -63,7 +63,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
||||
}
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
/* 批量创建子插件 */
|
||||
@@ -88,7 +88,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
||||
}
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
if (item.version === 'v2') {
|
||||
await MongoAppVersion.create(
|
||||
@@ -100,7 +100,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
||||
edges: item.edges
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -160,7 +160,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
||||
}
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
if (plugin.version === 'v2') {
|
||||
@@ -173,7 +173,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
||||
edges: plugin.edges
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,7 +98,8 @@ async function handler(
|
||||
}
|
||||
],
|
||||
{
|
||||
session
|
||||
session,
|
||||
ordered: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ export const onCreateApp = async ({
|
||||
'pluginData.nodeVersion': defaultNodeVersion
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
if (!AppFolderTypeList.includes(type!)) {
|
||||
@@ -144,7 +144,7 @@ export const onCreateApp = async ({
|
||||
isPublish: true
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -89,7 +89,8 @@ async function handler(req: ApiRequestProps<CreateAppFolderBody>) {
|
||||
}
|
||||
],
|
||||
{
|
||||
session
|
||||
session,
|
||||
ordered: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,18 +1,23 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@fastgpt/service/common/response';
|
||||
import { loadOpenAPISchemaFromUrl } from '@fastgpt/global/common/string/swagger';
|
||||
import { NextAPI } from '@/service/middleware/entry';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
import { isInternalAddress } from '@fastgpt/service/common/system/utils';
|
||||
|
||||
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||
try {
|
||||
const apiURL = req.body.url as string;
|
||||
async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||
const apiURL = req.body.url as string;
|
||||
|
||||
return jsonRes(res, {
|
||||
data: await loadOpenAPISchemaFromUrl(apiURL)
|
||||
});
|
||||
} catch (err) {
|
||||
jsonRes(res, {
|
||||
code: 500,
|
||||
error: err
|
||||
});
|
||||
if (!apiURL) {
|
||||
return Promise.reject(CommonErrEnum.missingParams);
|
||||
}
|
||||
|
||||
const isInternal = isInternalAddress(apiURL);
|
||||
|
||||
if (isInternal) {
|
||||
return Promise.reject('Invalid url');
|
||||
}
|
||||
|
||||
return await loadOpenAPISchemaFromUrl(apiURL);
|
||||
}
|
||||
|
||||
export default NextAPI(handler);
|
||||
|
||||
@@ -45,7 +45,7 @@ async function handler(req: ApiRequestProps<PostPublishAppProps>, res: NextApiRe
|
||||
tmbId
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
// update app
|
||||
|
||||
@@ -88,7 +88,7 @@ async function handler(
|
||||
yuqueServer
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
await refreshSourceAvatar(avatar, undefined, session);
|
||||
|
||||
|
||||
@@ -34,17 +34,17 @@ async function handler(req: NextApiRequest) {
|
||||
});
|
||||
const datasetIds = datasets.map((d) => d._id);
|
||||
|
||||
// delete collection.tags
|
||||
await MongoDatasetCollectionTags.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
});
|
||||
|
||||
// delete all dataset.data and pg data
|
||||
await mongoSessionRun(async (session) => {
|
||||
// delete dataset data
|
||||
await delDatasetRelevantData({ datasets, session });
|
||||
|
||||
// delete collection.tags
|
||||
await MongoDatasetCollectionTags.deleteMany({
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
}).session(session);
|
||||
|
||||
// delete dataset
|
||||
await MongoDataset.deleteMany(
|
||||
{
|
||||
|
||||
@@ -87,7 +87,7 @@ async function handler(
|
||||
permission: OwnerPermissionVal
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -122,7 +122,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
|
||||
}
|
||||
],
|
||||
{
|
||||
session
|
||||
session,
|
||||
ordered: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ export async function insertData2Dataset({
|
||||
}))
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
// 3. Create mongo data text
|
||||
@@ -112,7 +112,7 @@ export async function insertData2Dataset({
|
||||
fullTextToken: jiebaSplit({ text: qaStr })
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
|
||||
return {
|
||||
|
||||
@@ -192,7 +192,7 @@ const rebuildData = async ({
|
||||
retryCount: 50
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -37,7 +37,7 @@ export async function initRootUser(retry = 3): Promise<any> {
|
||||
password: hashStr(psw)
|
||||
}
|
||||
],
|
||||
{ session }
|
||||
{ session, ordered: true }
|
||||
);
|
||||
rootId = _id;
|
||||
}
|
||||
|
||||
@@ -152,6 +152,7 @@ export const putChannel = (data: ChannelInfoType) =>
|
||||
export const deleteChannel = (id: number) => DELETE(`/channel/${id}`);
|
||||
|
||||
export const getChannelLog = (params: {
|
||||
request_id?: string;
|
||||
channel?: string;
|
||||
model_name?: string;
|
||||
code_type?: 'all' | 'success' | 'error';
|
||||
@@ -164,6 +165,7 @@ export const getChannelLog = (params: {
|
||||
logs: ChannelLogListItemType[];
|
||||
total: number;
|
||||
}>(`/logs/search`, {
|
||||
request_id: params.request_id,
|
||||
channel: params.channel,
|
||||
model_name: params.model_name,
|
||||
code_type: params.code_type,
|
||||
|
||||
Reference in New Issue
Block a user