Compare commits
6 Commits
v4.8.23-fi
...
v4.8.23-fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e860c56b77 | ||
|
|
efac5312b4 | ||
|
|
4bc7f21182 | ||
|
|
113e8f711f | ||
|
|
abc6dffb41 | ||
|
|
f7b2a57ca3 |
@@ -114,15 +114,15 @@ services:
|
|||||||
# fastgpt
|
# fastgpt
|
||||||
sandbox:
|
sandbox:
|
||||||
container_name: sandbox
|
container_name: sandbox
|
||||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||||
networks:
|
networks:
|
||||||
- fastgpt
|
- fastgpt
|
||||||
restart: always
|
restart: always
|
||||||
fastgpt:
|
fastgpt:
|
||||||
container_name: fastgpt
|
container_name: fastgpt
|
||||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||||
ports:
|
ports:
|
||||||
- 3000:3000
|
- 3000:3000
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -72,15 +72,15 @@ services:
|
|||||||
# fastgpt
|
# fastgpt
|
||||||
sandbox:
|
sandbox:
|
||||||
container_name: sandbox
|
container_name: sandbox
|
||||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||||
networks:
|
networks:
|
||||||
- fastgpt
|
- fastgpt
|
||||||
restart: always
|
restart: always
|
||||||
fastgpt:
|
fastgpt:
|
||||||
container_name: fastgpt
|
container_name: fastgpt
|
||||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||||
ports:
|
ports:
|
||||||
- 3000:3000
|
- 3000:3000
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -53,15 +53,15 @@ services:
|
|||||||
wait $$!
|
wait $$!
|
||||||
sandbox:
|
sandbox:
|
||||||
container_name: sandbox
|
container_name: sandbox
|
||||||
image: ghcr.io/labring/fastgpt-sandbox:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt-sandbox:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.8.23-fix # 阿里云
|
||||||
networks:
|
networks:
|
||||||
- fastgpt
|
- fastgpt
|
||||||
restart: always
|
restart: always
|
||||||
fastgpt:
|
fastgpt:
|
||||||
container_name: fastgpt
|
container_name: fastgpt
|
||||||
image: ghcr.io/labring/fastgpt:v4.8.22 # git
|
image: ghcr.io/labring/fastgpt:v4.8.23-fix # git
|
||||||
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.22 # 阿里云
|
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.8.23-fix # 阿里云
|
||||||
ports:
|
ports:
|
||||||
- 3000:3000
|
- 3000:3000
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
title: 'V4.8.23(进行中)'
|
title: 'V4.8.23'
|
||||||
description: 'FastGPT V4.8.23 更新说明'
|
description: 'FastGPT V4.8.23 更新说明'
|
||||||
icon: 'upgrade'
|
icon: 'upgrade'
|
||||||
draft: false
|
draft: false
|
||||||
@@ -7,6 +7,28 @@ toc: true
|
|||||||
weight: 802
|
weight: 802
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 更新指南
|
||||||
|
|
||||||
|
### 1. 做好数据库备份
|
||||||
|
|
||||||
|
### 2. 更新镜像:
|
||||||
|
|
||||||
|
- 更新 fastgpt 镜像 tag: v4.8.23-fix
|
||||||
|
- 更新 fastgpt-pro 商业版镜像 tag: v4.8.23-fix
|
||||||
|
- Sandbox 镜像无需更新
|
||||||
|
|
||||||
|
### 3. 运行升级脚本
|
||||||
|
|
||||||
|
从任意终端,发起 1 个 HTTP 请求。其中 {{rootkey}} 替换成环境变量里的 `rootkey`;{{host}} 替换成**FastGPT 域名**。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --location --request POST 'https://{{host}}/api/admin/initv4823' \
|
||||||
|
--header 'rootkey: {{rootkey}}' \
|
||||||
|
--header 'Content-Type: application/json'
|
||||||
|
```
|
||||||
|
|
||||||
|
脚本会清理一些知识库脏数据,主要是多余的全文索引。
|
||||||
|
|
||||||
## 🚀 新增内容
|
## 🚀 新增内容
|
||||||
|
|
||||||
1. 增加默认“知识库文本理解模型”配置
|
1. 增加默认“知识库文本理解模型”配置
|
||||||
@@ -28,4 +50,5 @@ weight: 802
|
|||||||
2. 暂时移除 md 阅读优化,避免链接分割错误。
|
2. 暂时移除 md 阅读优化,避免链接分割错误。
|
||||||
3. 离开团队时,未刷新成员列表。
|
3. 离开团队时,未刷新成员列表。
|
||||||
4. PPTX 编码错误,导致解析失败。
|
4. PPTX 编码错误,导致解析失败。
|
||||||
5. 删除知识库单条数据时,全文索引未跟随删除。
|
5. 删除知识库单条数据时,全文索引未跟随删除。
|
||||||
|
6. 修复 Mongo Dataset text 索引在查询数据时未生效。
|
||||||
@@ -118,7 +118,7 @@ export async function delImgByRelatedId({
|
|||||||
}: {
|
}: {
|
||||||
teamId: string;
|
teamId: string;
|
||||||
relateIds: string[];
|
relateIds: string[];
|
||||||
session: ClientSession;
|
session?: ClientSession;
|
||||||
}) {
|
}) {
|
||||||
if (relateIds.length === 0) return;
|
if (relateIds.length === 0) return;
|
||||||
|
|
||||||
|
|||||||
@@ -16,16 +16,30 @@ export async function connectMongo(): Promise<Mongoose> {
|
|||||||
|
|
||||||
console.log('mongo start connect');
|
console.log('mongo start connect');
|
||||||
try {
|
try {
|
||||||
|
// Remove existing listeners to prevent duplicates
|
||||||
|
connectionMongo.connection.removeAllListeners('error');
|
||||||
|
connectionMongo.connection.removeAllListeners('disconnected');
|
||||||
connectionMongo.set('strictQuery', false);
|
connectionMongo.set('strictQuery', false);
|
||||||
|
|
||||||
connectionMongo.connection.on('error', async (error) => {
|
connectionMongo.connection.on('error', async (error) => {
|
||||||
console.log('mongo error', error);
|
console.log('mongo error', error);
|
||||||
await connectionMongo.disconnect();
|
try {
|
||||||
await delay(1000);
|
if (connectionMongo.connection.readyState !== 0) {
|
||||||
connectMongo();
|
await connectionMongo.disconnect();
|
||||||
|
await delay(1000);
|
||||||
|
await connectMongo();
|
||||||
|
}
|
||||||
|
} catch (error) {}
|
||||||
});
|
});
|
||||||
connectionMongo.connection.on('disconnected', () => {
|
connectionMongo.connection.on('disconnected', async () => {
|
||||||
console.log('mongo disconnected');
|
console.log('mongo disconnected');
|
||||||
|
try {
|
||||||
|
if (connectionMongo.connection.readyState !== 0) {
|
||||||
|
await connectionMongo.disconnect();
|
||||||
|
await delay(1000);
|
||||||
|
await connectMongo();
|
||||||
|
}
|
||||||
|
} catch (error) {}
|
||||||
});
|
});
|
||||||
|
|
||||||
await connectionMongo.connect(process.env.MONGODB_URI as string, {
|
await connectionMongo.connect(process.env.MONGODB_URI as string, {
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ export const getAxiosConfig = (props?: { userKey?: OpenaiAccountType }) => {
|
|||||||
const { userKey } = props || {};
|
const { userKey } = props || {};
|
||||||
|
|
||||||
const baseUrl = userKey?.baseUrl || global?.systemEnv?.oneapiUrl || openaiBaseUrl;
|
const baseUrl = userKey?.baseUrl || global?.systemEnv?.oneapiUrl || openaiBaseUrl;
|
||||||
const apiKey = userKey?.key || global?.systemEnv?.chatApiKey || process.env.CHAT_API_KEY || '';
|
const apiKey = userKey?.key || global?.systemEnv?.chatApiKey || openaiBaseKey;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
baseUrl,
|
baseUrl,
|
||||||
|
|||||||
@@ -1,4 +1,10 @@
|
|||||||
{
|
{
|
||||||
"provider": "AliCloud",
|
"provider": "AliCloud",
|
||||||
"list": []
|
"list": [
|
||||||
}
|
{
|
||||||
|
"model": "SenseVoiceSmall",
|
||||||
|
"name": "SenseVoiceSmall",
|
||||||
|
"type": "stt"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ import { MongoImage } from '../../../common/file/image/schema';
|
|||||||
import { hashStr } from '@fastgpt/global/common/string/tools';
|
import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||||
import { addDays } from 'date-fns';
|
import { addDays } from 'date-fns';
|
||||||
import { MongoDatasetDataText } from '../data/dataTextSchema';
|
import { MongoDatasetDataText } from '../data/dataTextSchema';
|
||||||
|
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
|
||||||
|
|
||||||
export const createCollectionAndInsertData = async ({
|
export const createCollectionAndInsertData = async ({
|
||||||
dataset,
|
dataset,
|
||||||
@@ -216,7 +217,7 @@ export async function createOneCollection({
|
|||||||
nextSyncTime
|
nextSyncTime
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return collection;
|
return collection;
|
||||||
@@ -234,7 +235,7 @@ export const delCollectionRelatedSource = async ({
|
|||||||
relatedImgId?: string;
|
relatedImgId?: string;
|
||||||
};
|
};
|
||||||
}[];
|
}[];
|
||||||
session: ClientSession;
|
session?: ClientSession;
|
||||||
}) => {
|
}) => {
|
||||||
if (collections.length === 0) return;
|
if (collections.length === 0) return;
|
||||||
|
|
||||||
@@ -282,47 +283,55 @@ export async function delCollection({
|
|||||||
const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
|
const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
|
||||||
const collectionIds = collections.map((item) => String(item._id));
|
const collectionIds = collections.map((item) => String(item._id));
|
||||||
|
|
||||||
// Delete training data
|
await retryFn(async () => {
|
||||||
await MongoDatasetTraining.deleteMany({
|
await Promise.all([
|
||||||
teamId,
|
// Delete training data
|
||||||
datasetId: { $in: datasetIds },
|
MongoDatasetTraining.deleteMany({
|
||||||
collectionId: { $in: collectionIds }
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds },
|
||||||
|
collectionId: { $in: collectionIds }
|
||||||
|
}),
|
||||||
|
// Delete dataset_data_texts
|
||||||
|
MongoDatasetDataText.deleteMany({
|
||||||
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds },
|
||||||
|
collectionId: { $in: collectionIds }
|
||||||
|
}),
|
||||||
|
// Delete dataset_datas
|
||||||
|
MongoDatasetData.deleteMany({
|
||||||
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds },
|
||||||
|
collectionId: { $in: collectionIds }
|
||||||
|
}),
|
||||||
|
...(delImg
|
||||||
|
? [
|
||||||
|
delImgByRelatedId({
|
||||||
|
teamId,
|
||||||
|
relateIds: collections
|
||||||
|
.map((item) => item?.metadata?.relatedImgId || '')
|
||||||
|
.filter(Boolean)
|
||||||
|
})
|
||||||
|
]
|
||||||
|
: []),
|
||||||
|
...(delFile
|
||||||
|
? [
|
||||||
|
delFileByFileIdList({
|
||||||
|
bucketName: BucketNameEnum.dataset,
|
||||||
|
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
|
||||||
|
})
|
||||||
|
]
|
||||||
|
: []),
|
||||||
|
// Delete vector data
|
||||||
|
deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
|
||||||
|
]);
|
||||||
|
|
||||||
|
// delete collections
|
||||||
|
await MongoDatasetCollection.deleteMany(
|
||||||
|
{
|
||||||
|
teamId,
|
||||||
|
_id: { $in: collectionIds }
|
||||||
|
},
|
||||||
|
{ session }
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
if (delImg) {
|
|
||||||
await delImgByRelatedId({
|
|
||||||
teamId,
|
|
||||||
relateIds: collections.map((item) => item?.metadata?.relatedImgId || '').filter(Boolean),
|
|
||||||
session
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (delFile) {
|
|
||||||
await delFileByFileIdList({
|
|
||||||
bucketName: BucketNameEnum.dataset,
|
|
||||||
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete dataset_datas
|
|
||||||
await MongoDatasetData.deleteMany(
|
|
||||||
{ teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
|
|
||||||
{ session }
|
|
||||||
);
|
|
||||||
// Delete dataset_data_texts
|
|
||||||
await MongoDatasetDataText.deleteMany(
|
|
||||||
{ teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
|
|
||||||
{ session }
|
|
||||||
);
|
|
||||||
|
|
||||||
// delete collections
|
|
||||||
await MongoDatasetCollection.deleteMany(
|
|
||||||
{
|
|
||||||
teamId,
|
|
||||||
_id: { $in: collectionIds }
|
|
||||||
},
|
|
||||||
{ session }
|
|
||||||
);
|
|
||||||
|
|
||||||
// no session delete: delete files, vector data
|
|
||||||
await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ export const createOrGetCollectionTags = async ({
|
|||||||
datasetId,
|
datasetId,
|
||||||
tag: tagContent
|
tag: tagContent
|
||||||
})),
|
})),
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
|
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import { MongoDatasetData } from './data/schema';
|
|||||||
import { deleteDatasetDataVector } from '../../common/vectorStore/controller';
|
import { deleteDatasetDataVector } from '../../common/vectorStore/controller';
|
||||||
import { MongoDatasetDataText } from './data/dataTextSchema';
|
import { MongoDatasetDataText } from './data/dataTextSchema';
|
||||||
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
|
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
|
||||||
|
import { retryFn } from '@fastgpt/global/common/system/utils';
|
||||||
|
|
||||||
/* ============= dataset ========== */
|
/* ============= dataset ========== */
|
||||||
/* find all datasetId by top datasetId */
|
/* find all datasetId by top datasetId */
|
||||||
@@ -78,40 +79,39 @@ export async function delDatasetRelevantData({
|
|||||||
|
|
||||||
const datasetIds = datasets.map((item) => item._id);
|
const datasetIds = datasets.map((item) => item._id);
|
||||||
|
|
||||||
// delete training data
|
|
||||||
await MongoDatasetTraining.deleteMany({
|
|
||||||
teamId,
|
|
||||||
datasetId: { $in: datasetIds }
|
|
||||||
});
|
|
||||||
|
|
||||||
// Get _id, teamId, fileId, metadata.relatedImgId for all collections
|
// Get _id, teamId, fileId, metadata.relatedImgId for all collections
|
||||||
const collections = await MongoDatasetCollection.find(
|
const collections = await MongoDatasetCollection.find(
|
||||||
{
|
{
|
||||||
teamId,
|
teamId,
|
||||||
datasetId: { $in: datasetIds }
|
datasetId: { $in: datasetIds }
|
||||||
},
|
},
|
||||||
'_id teamId datasetId fileId metadata',
|
'_id teamId datasetId fileId metadata'
|
||||||
{ session }
|
|
||||||
).lean();
|
).lean();
|
||||||
|
|
||||||
// Delete Image and file
|
await retryFn(async () => {
|
||||||
await delCollectionRelatedSource({ collections, session });
|
await Promise.all([
|
||||||
|
// delete training data
|
||||||
|
MongoDatasetTraining.deleteMany({
|
||||||
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds }
|
||||||
|
}),
|
||||||
|
//Delete dataset_data_texts
|
||||||
|
MongoDatasetDataText.deleteMany({
|
||||||
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds }
|
||||||
|
}),
|
||||||
|
//delete dataset_datas
|
||||||
|
MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } }),
|
||||||
|
// Delete Image and file
|
||||||
|
delCollectionRelatedSource({ collections }),
|
||||||
|
// Delete vector data
|
||||||
|
deleteDatasetDataVector({ teamId, datasetIds })
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
// delete collections
|
// delete collections
|
||||||
await MongoDatasetCollection.deleteMany({
|
await MongoDatasetCollection.deleteMany({
|
||||||
teamId,
|
teamId,
|
||||||
datasetId: { $in: datasetIds }
|
datasetId: { $in: datasetIds }
|
||||||
}).session(session);
|
}).session(session);
|
||||||
|
|
||||||
// No session delete:
|
|
||||||
// Delete dataset_data_texts
|
|
||||||
await MongoDatasetDataText.deleteMany({
|
|
||||||
teamId,
|
|
||||||
datasetId: { $in: datasetIds }
|
|
||||||
});
|
|
||||||
// delete dataset_datas
|
|
||||||
await MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } });
|
|
||||||
|
|
||||||
// Delete vector data
|
|
||||||
await deleteDatasetDataVector({ teamId, datasetIds });
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ try {
|
|||||||
default_language: 'none'
|
default_language: 'none'
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, collectionId: 1 });
|
||||||
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
|
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log(error);
|
console.log(error);
|
||||||
|
|||||||
@@ -196,7 +196,8 @@ export async function syncCollaborators({
|
|||||||
permission: item.permission
|
permission: item.permission
|
||||||
})),
|
})),
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ export const initTeamFreePlan = async ({
|
|||||||
surplusPoints: freePoints
|
surplusPoints: freePoints
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ export const createTrainingUsage = async ({
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return { billId: String(_id) };
|
return { billId: String(_id) };
|
||||||
|
|||||||
@@ -217,7 +217,7 @@ export function useScrollPagination<
|
|||||||
const offset = init ? 0 : data.length;
|
const offset = init ? 0 : data.length;
|
||||||
|
|
||||||
setTrue();
|
setTrue();
|
||||||
console.log(offset);
|
|
||||||
try {
|
try {
|
||||||
const res = await api({
|
const res = await api({
|
||||||
offset,
|
offset,
|
||||||
|
|||||||
@@ -24,6 +24,7 @@
|
|||||||
"key_type": "API key format:",
|
"key_type": "API key format:",
|
||||||
"log": "Call log",
|
"log": "Call log",
|
||||||
"log_detail": "Log details",
|
"log_detail": "Log details",
|
||||||
|
"log_request_id_search": "Search by requestId",
|
||||||
"log_status": "Status",
|
"log_status": "Status",
|
||||||
"mapping": "Model Mapping",
|
"mapping": "Model Mapping",
|
||||||
"mapping_tip": "A valid Json is required. \nThe model can be mapped when sending a request to the actual address. \nFor example:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\nWhen FastGPT requests the gpt-4o model, the gpt-4o-test model is sent to the actual address, instead of gpt-4o.",
|
"mapping_tip": "A valid Json is required. \nThe model can be mapped when sending a request to the actual address. \nFor example:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\nWhen FastGPT requests the gpt-4o model, the gpt-4o-test model is sent to the actual address, instead of gpt-4o.",
|
||||||
|
|||||||
@@ -24,6 +24,7 @@
|
|||||||
"key_type": "API key 格式: ",
|
"key_type": "API key 格式: ",
|
||||||
"log": "调用日志",
|
"log": "调用日志",
|
||||||
"log_detail": "日志详情",
|
"log_detail": "日志详情",
|
||||||
|
"log_request_id_search": "根据 requestId 搜索",
|
||||||
"log_status": "状态",
|
"log_status": "状态",
|
||||||
"mapping": "模型映射",
|
"mapping": "模型映射",
|
||||||
"mapping_tip": "需填写一个有效 Json。可在向实际地址发送请求时,对模型进行映射。例如:\n{\n \"gpt-4o\": \"gpt-4o-test\"\n}\n当 FastGPT 请求 gpt-4o 模型时,会向实际地址发送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
"mapping_tip": "需填写一个有效 Json。可在向实际地址发送请求时,对模型进行映射。例如:\n{\n \"gpt-4o\": \"gpt-4o-test\"\n}\n当 FastGPT 请求 gpt-4o 模型时,会向实际地址发送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
"key_type": "API key 格式:",
|
"key_type": "API key 格式:",
|
||||||
"log": "調用日誌",
|
"log": "調用日誌",
|
||||||
"log_detail": "日誌詳情",
|
"log_detail": "日誌詳情",
|
||||||
|
"log_request_id_search": "根據 requestId 搜索",
|
||||||
"log_status": "狀態",
|
"log_status": "狀態",
|
||||||
"mapping": "模型映射",
|
"mapping": "模型映射",
|
||||||
"mapping_tip": "需填寫一個有效 Json。\n可在向實際地址發送請求時,對模型進行映射。\n例如:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\n當 FastGPT 請求 gpt-4o 模型時,會向實際地址發送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
"mapping_tip": "需填寫一個有效 Json。\n可在向實際地址發送請求時,對模型進行映射。\n例如:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\n當 FastGPT 請求 gpt-4o 模型時,會向實際地址發送 gpt-4o-test 的模型,而不是 gpt-4o。",
|
||||||
|
|||||||
3
plugins/webcrawler/.dockerignore
Normal file
3
plugins/webcrawler/.dockerignore
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# 忽略 .git 目录及其内容
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
25
plugins/webcrawler/.gitignore
vendored
Normal file
25
plugins/webcrawler/.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
*~
|
||||||
|
|
||||||
|
searxng-docker.service
|
||||||
|
caddy
|
||||||
|
srv
|
||||||
|
searxng/uwsgi.ini
|
||||||
|
.env
|
||||||
|
SPIDER/.env
|
||||||
|
|
||||||
|
# 忽略 node_modules 文件夹
|
||||||
|
SPIDER/node_modules/
|
||||||
|
|
||||||
|
# 忽略构建输出文件夹
|
||||||
|
SPIDER/dist/
|
||||||
|
|
||||||
|
# 忽略日志文件
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# 忽略操作系统生成的文件
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# 忽略 IDE/编辑器生成的文件
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
14
plugins/webcrawler/.searchxng.env
Normal file
14
plugins/webcrawler/.searchxng.env
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# By default listen on https://localhost
|
||||||
|
# To change this:
|
||||||
|
# * uncomment SEARXNG_HOSTNAME, and replace <host> by the SearXNG hostname
|
||||||
|
# * uncomment LETSENCRYPT_EMAIL, and replace <email> by your email (required to create a Let's Encrypt certificate)
|
||||||
|
|
||||||
|
# SEARXNG_HOSTNAME=<host>
|
||||||
|
# LETSENCRYPT_EMAIL=<email>
|
||||||
|
|
||||||
|
# Optional:
|
||||||
|
# If you run a very small or a very large instance, you might want to change the number of uwsgi workers and threads per worker
|
||||||
|
# More workers (= processes) means that more search requests can be handled at the same time, but it also causes more resource usage
|
||||||
|
|
||||||
|
SEARXNG_UWSGI_WORKERS=4
|
||||||
|
SEARXNG_UWSGI_THREADS=4
|
||||||
91
plugins/webcrawler/Caddyfile
Normal file
91
plugins/webcrawler/Caddyfile
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
{
|
||||||
|
admin off
|
||||||
|
|
||||||
|
log {
|
||||||
|
output stderr
|
||||||
|
format filter {
|
||||||
|
# Preserves first 8 bits from IPv4 and 32 bits from IPv6
|
||||||
|
request>remote_ip ip_mask 8 32
|
||||||
|
request>client_ip ip_mask 8 32
|
||||||
|
|
||||||
|
# Remove identifiable information
|
||||||
|
request>remote_port delete
|
||||||
|
request>headers delete
|
||||||
|
request>uri query {
|
||||||
|
delete url
|
||||||
|
delete h
|
||||||
|
delete q
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{$SEARXNG_HOSTNAME}
|
||||||
|
|
||||||
|
tls {$SEARXNG_TLS}
|
||||||
|
|
||||||
|
encode zstd gzip
|
||||||
|
|
||||||
|
@api {
|
||||||
|
path /config
|
||||||
|
path /healthz
|
||||||
|
path /stats/errors
|
||||||
|
path /stats/checker
|
||||||
|
}
|
||||||
|
|
||||||
|
@search {
|
||||||
|
path /search
|
||||||
|
}
|
||||||
|
|
||||||
|
@imageproxy {
|
||||||
|
path /image_proxy
|
||||||
|
}
|
||||||
|
|
||||||
|
@static {
|
||||||
|
path /static/*
|
||||||
|
}
|
||||||
|
|
||||||
|
header {
|
||||||
|
# CSP (https://content-security-policy.com)
|
||||||
|
Content-Security-Policy "upgrade-insecure-requests; default-src 'none'; script-src 'self'; style-src 'self' 'unsafe-inline'; form-action 'self' https://github.com/searxng/searxng/issues/new; font-src 'self'; frame-ancestors 'self'; base-uri 'self'; connect-src 'self' https://overpass-api.de; img-src * data:; frame-src https://www.youtube-nocookie.com https://player.vimeo.com https://www.dailymotion.com https://www.deezer.com https://www.mixcloud.com https://w.soundcloud.com https://embed.spotify.com;"
|
||||||
|
|
||||||
|
# Disable some browser features
|
||||||
|
Permissions-Policy "accelerometer=(),camera=(),geolocation=(),gyroscope=(),magnetometer=(),microphone=(),payment=(),usb=()"
|
||||||
|
|
||||||
|
# Set referrer policy
|
||||||
|
Referrer-Policy "no-referrer"
|
||||||
|
|
||||||
|
# Force clients to use HTTPS
|
||||||
|
Strict-Transport-Security "max-age=31536000"
|
||||||
|
|
||||||
|
# Prevent MIME type sniffing from the declared Content-Type
|
||||||
|
X-Content-Type-Options "nosniff"
|
||||||
|
|
||||||
|
# X-Robots-Tag (comment to allow site indexing)
|
||||||
|
X-Robots-Tag "noindex, noarchive, nofollow"
|
||||||
|
|
||||||
|
# Remove "Server" header
|
||||||
|
-Server
|
||||||
|
}
|
||||||
|
|
||||||
|
header @api {
|
||||||
|
Access-Control-Allow-Methods "GET, OPTIONS"
|
||||||
|
Access-Control-Allow-Origin "*"
|
||||||
|
}
|
||||||
|
|
||||||
|
route {
|
||||||
|
# Cache policy
|
||||||
|
header Cache-Control "max-age=0, no-store"
|
||||||
|
header @search Cache-Control "max-age=5, private"
|
||||||
|
header @imageproxy Cache-Control "max-age=604800, public"
|
||||||
|
header @static Cache-Control "max-age=31536000, public, immutable"
|
||||||
|
}
|
||||||
|
|
||||||
|
# SearXNG (uWSGI)
|
||||||
|
reverse_proxy localhost:8080 {
|
||||||
|
header_up X-Forwarded-Port ""
|
||||||
|
header_up X-Real-IP ""
|
||||||
|
|
||||||
|
# https://github.com/searx/searx-docker/issues/24
|
||||||
|
header_up Connection "close"
|
||||||
|
}
|
||||||
57
plugins/webcrawler/Dockerfile
Normal file
57
plugins/webcrawler/Dockerfile
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
FROM node:20.10.0-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# 安装 Chrome 运行依赖
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
ca-certificates \
|
||||||
|
fonts-liberation \
|
||||||
|
libasound2 \
|
||||||
|
libatk-bridge2.0-0 \
|
||||||
|
libatk1.0-0 \
|
||||||
|
libc6 \
|
||||||
|
libcairo2 \
|
||||||
|
libcups2 \
|
||||||
|
libdbus-1-3 \
|
||||||
|
libexpat1 \
|
||||||
|
libfontconfig1 \
|
||||||
|
libgbm1 \
|
||||||
|
libgcc1 \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libgtk-3-0 \
|
||||||
|
libnspr4 \
|
||||||
|
libnss3 \
|
||||||
|
libpango-1.0-0 \
|
||||||
|
libpangocairo-1.0-0 \
|
||||||
|
libstdc++6 \
|
||||||
|
libx11-6 \
|
||||||
|
libx11-xcb1 \
|
||||||
|
libxcb1 \
|
||||||
|
libxcomposite1 \
|
||||||
|
libxcursor1 \
|
||||||
|
libxdamage1 \
|
||||||
|
libxext6 \
|
||||||
|
libxfixes3 \
|
||||||
|
libxi6 \
|
||||||
|
libxrandr2 \
|
||||||
|
libxrender1 \
|
||||||
|
libxss1 \
|
||||||
|
libxtst6 \
|
||||||
|
lsb-release \
|
||||||
|
wget \
|
||||||
|
xdg-utils \
|
||||||
|
chromium \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# 安装中文字体
|
||||||
|
RUN apt-get update && apt-get install -y fonts-wqy-microhei && fc-cache -f -v
|
||||||
|
|
||||||
|
COPY SPIDER/. .
|
||||||
|
|
||||||
|
RUN test -f package.json || (echo "package.json missing" && exit 1)
|
||||||
|
RUN test -f .env || (echo ".env file missing in SPIDER directory" && exit 1)
|
||||||
|
|
||||||
|
RUN npm run build
|
||||||
|
|
||||||
|
EXPOSE 3000
|
||||||
|
CMD ["npm", "start"]
|
||||||
73
plugins/webcrawler/README.md
Normal file
73
plugins/webcrawler/README.md
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# webcrawler
|
||||||
|
## docker版快速部署
|
||||||
|
|
||||||
|
## 代码版部署
|
||||||
|
0. 按照 https://github.com/searxng/searxng-docker 的方式处理docker
|
||||||
|
1. 参考SPIDER文件夹下的.env.example,添加.env文件
|
||||||
|
2. 进入SPIDER文件夹进行pnpm install
|
||||||
|
3. 回到根目录,运行docker compose up -d
|
||||||
|
|
||||||
|
## 代码版开发
|
||||||
|
1. 将docker-compose.yml中与SPIDER相关的部分注释掉(nodeapp)
|
||||||
|
2. .env文件中的URL参照注释修改
|
||||||
|
3. 注释掉启动 puppeteer 部分里面指定浏览器地址的代码
|
||||||
|
4. pnpm run dev
|
||||||
|
|
||||||
|
|
||||||
|
## 测试样例:
|
||||||
|
Auth 的 Bearer Token 记得填,也就是 .env 里的 ACCESS_TOKEN
|
||||||
|
|
||||||
|
### 读取单页面(content以HTML形式返回)
|
||||||
|
```
|
||||||
|
http://localhost:3000/api/read?queryUrl=<url>
|
||||||
|
```
|
||||||
|
|
||||||
|
返回结构
|
||||||
|
```json
|
||||||
|
|
||||||
|
{
|
||||||
|
"status": 200,
|
||||||
|
"data": {
|
||||||
|
"title": "something here",
|
||||||
|
"content": "something here"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{
|
||||||
|
"status": 400,
|
||||||
|
"error": {
|
||||||
|
"code": "MISSING_PARAM",
|
||||||
|
"message": "缺少必要参数: query"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 搜索(content以HTML形式返回)
|
||||||
|
```
|
||||||
|
http://localhost:3000/api/search?query=<something>&pageCount=5&needDetails=true&engine=baidu
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": 200,
|
||||||
|
"data": {
|
||||||
|
"results": [
|
||||||
|
{
|
||||||
|
"title": "string",
|
||||||
|
"url": "string",
|
||||||
|
"snippet": "string",
|
||||||
|
"source": "string",
|
||||||
|
"crawlStatus": "string",
|
||||||
|
"score": 0,
|
||||||
|
"content": "string"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{
|
||||||
|
"status": 400,
|
||||||
|
"error": {
|
||||||
|
"code": "MISSING_PARAM",
|
||||||
|
"message": "缺少必要参数: query"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
23
plugins/webcrawler/SPIDER/.env.example
Normal file
23
plugins/webcrawler/SPIDER/.env.example
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
|
||||||
|
ACCESS_TOKEN=114514
|
||||||
|
DETECT_WEBSITE = zhuanlan.zhihu.com
|
||||||
|
STRATEGIES=[{"waitUntil":"networkidle0","timeout":5000},{"waitUntil":"networkidle2","timeout":10000},{"waitUntil":"load","timeout":15000}]
|
||||||
|
PORT=3000
|
||||||
|
MAX_CONCURRENCY=10
|
||||||
|
NODE_ENV=development
|
||||||
|
ENGINE = [
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
ENGINE_BAIDUURL=https://www.baidu.com/s
|
||||||
|
#ENGINE_SEARCHXNGURL=http://localhost:8080/search
|
||||||
|
ENGINE_SEARCHXNGURL=http://searxng:8080/search
|
||||||
|
|
||||||
|
#MONGODB_URI=mongodb://root:example@localhost:27017
|
||||||
|
MONGODB_URI=mongodb://root:example@mongodb:27017
|
||||||
|
BLACKLIST = [".gov.cn",".edu.cn"]
|
||||||
|
|
||||||
|
STD_TTL=3600
|
||||||
|
EXPIRE_AFTER_SECONDS=9000
|
||||||
|
|
||||||
|
#VALIDATE_PROXY=[{"ip":"","port":},{"ip":"","port":}]
|
||||||
5804
plugins/webcrawler/SPIDER/package-lock.json
generated
Normal file
5804
plugins/webcrawler/SPIDER/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
62
plugins/webcrawler/SPIDER/package.json
Normal file
62
plugins/webcrawler/SPIDER/package.json
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
{
|
||||||
|
"name": "spider",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "/dist/index.ts",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1",
|
||||||
|
"start": "ts-node src/index.ts",
|
||||||
|
"build": "webpack",
|
||||||
|
"dev": "ts-node-dev --respawn src/index.ts"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"@types/node-fetch": "^2.6.12",
|
||||||
|
"assert": "^2.1.0",
|
||||||
|
"axios": "^1.7.9",
|
||||||
|
"body-parser": "^1.20.3",
|
||||||
|
"browserify-zlib": "^0.2.0",
|
||||||
|
"buffer": "^6.0.3",
|
||||||
|
"cheerio": "^1.0.0",
|
||||||
|
"crypto-browserify": "^3.12.1",
|
||||||
|
"dotenv": "^16.4.7",
|
||||||
|
"express": "^4.21.2",
|
||||||
|
"https-proxy-agent": "^7.0.6",
|
||||||
|
"jsdom": "^26.0.0",
|
||||||
|
"mongodb": "^6.13.1",
|
||||||
|
"node-cache": "^5.1.2",
|
||||||
|
"node-fetch": "^2.7.0",
|
||||||
|
"os-browserify": "^0.3.0",
|
||||||
|
"path-browserify": "^1.0.1",
|
||||||
|
"puppeteer": "^24.2.1",
|
||||||
|
"puppeteer-cluster": "^0.24.0",
|
||||||
|
"querystring-es3": "^0.2.1",
|
||||||
|
"random-useragent": "^0.5.0",
|
||||||
|
"spider": "file:",
|
||||||
|
"stream-browserify": "^3.0.0",
|
||||||
|
"stream-http": "^3.2.0",
|
||||||
|
"string_decoder": "^1.3.0",
|
||||||
|
"turndown": "^7.2.0",
|
||||||
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
|
"url": "^0.11.4",
|
||||||
|
"user-agents": "^1.1.454",
|
||||||
|
"util": "^0.12.5",
|
||||||
|
"vm-browserify": "^1.1.2"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/body-parser": "^1.19.5",
|
||||||
|
"@types/express": "^5.0.0",
|
||||||
|
"@types/jsdom": "^21.1.7",
|
||||||
|
"@types/node": "^22.13.4",
|
||||||
|
"@types/random-useragent": "^0.3.3",
|
||||||
|
"@types/user-agents": "^1.0.4",
|
||||||
|
"ts-loader": "^9.5.2",
|
||||||
|
"ts-node-dev": "^2.0.0",
|
||||||
|
"typescript": "^5.7.3",
|
||||||
|
"webpack": "^5.98.0",
|
||||||
|
"webpack-cli": "^6.0.1",
|
||||||
|
"webpack-node-externals": "^3.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import { Request, Response } from 'express';
|
||||||
|
import fetch from 'node-fetch';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
// Load variables from a local .env file into process.env before anything reads them.
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Pool of desktop browser User-Agent strings.
 * One entry is picked at random for each outgoing quick-fetch request.
 */
const userAgents: string[] = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
];
|
||||||
|
|
||||||
|
/**
 * Express handler that downloads a remote page with a plain HTTP request and
 * returns the raw response body.
 *
 * Query parameters:
 *  - `url` (required): absolute http(s) URL to download.
 *
 * Responses:
 *  - 200 `{ status, data: { content } }` with the body text of the page.
 *  - 400 `MISSING_PARAM` when `url` is absent.
 *  - 400 `INVALID_PARAM` when `url` is not a single string or not a valid http(s) URL.
 *  - 500 `INTERNAL_SERVER_ERROR` when the request fails or the upstream answers non-2xx.
 */
export const quickFetch = async (req: Request, res: Response): Promise<void> => {
  const { url } = req.query;

  if (!url) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: url'
      }
    });
    return;
  }

  // Express turns repeated or bracketed query keys into arrays/objects, so the
  // value must be checked before it is treated as a string. Only http(s) targets
  // are allowed; anything else (file:, data:, malformed input) is a client error.
  let target: URL | null = null;
  if (typeof url === 'string') {
    try {
      target = new URL(url);
    } catch {
      target = null;
    }
  }
  if (!target || (target.protocol !== 'http:' && target.protocol !== 'https:')) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'INVALID_PARAM',
        message: '参数 url 必须是合法的 http(s) 地址'
      }
    });
    return;
  }

  try {
    const response = await fetch(target.href, {
      headers: {
        // Rotate the User-Agent so consecutive requests look less uniform.
        'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)],
        Referer: 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        Connection: 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const data = await response.text();
    res.status(200).json({
      status: 200,
      data: {
        content: data
      }
    });
  } catch (error) {
    console.error('Error fetching the page:', error);
    res.status(500).json({
      status: 500,
      error: {
        code: 'INTERNAL_SERVER_ERROR',
        message: '发生错误'
      }
    });
  }
};
|
||||||
|
|
||||||
|
export default { quickFetch };
|
||||||
148
plugins/webcrawler/SPIDER/src/controllers/readController.ts
Normal file
148
plugins/webcrawler/SPIDER/src/controllers/readController.ts
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
import { Request, Response } from 'express';
|
||||||
|
import puppeteer, { Page } from 'puppeteer';
|
||||||
|
import * as cheerio from 'cheerio';
|
||||||
|
import UserAgent from 'user-agents';
|
||||||
|
import { setupPage } from '../utils/setupPage'; // 导入 setupPage 模块
|
||||||
|
import dotenv from 'dotenv'; // 导入 dotenv 模块
|
||||||
|
import { URL } from 'url'; // 导入 URL 模块
|
||||||
|
import { handleSpecialWebsite } from '../specialHandlers'; // 导入 handleSpecialWebsite 模块
|
||||||
|
import fetch from 'node-fetch';
|
||||||
|
import { getCachedPage, updateCacheAsync } from '../utils/cacheUpdater'; // 导入缓存相关模块
|
||||||
|
|
||||||
|
dotenv.config(); // Load environment variables
|
||||||
|
|
||||||
|
/**
 * Splits a comma-separated environment value into trimmed, non-empty entries.
 * An unset or empty variable yields an empty list. Without the filter, `''.split(',')`
 * would produce `['']`, and `url.includes('')` is true for every URL.
 */
const parseCsvEnv = (raw: string | undefined): string[] =>
  (raw ?? '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);

/**
 * Parses a JSON-array environment value into a list of non-empty strings.
 * Throws when the value is not valid JSON or not an array, so a broken
 * configuration is reported at start-up instead of on the first request.
 */
const parseDomainListEnv = (raw: string | undefined): string[] => {
  if (!raw) {
    return [];
  }
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error('BLACKLIST must be a JSON array of domain suffixes');
  }
  // An empty suffix would match every hostname through endsWith(''), so drop it.
  return parsed.filter(
    (entry): entry is string => typeof entry === 'string' && entry.length > 0
  );
};

// URL fragments identifying sites that get the special page setup before navigation.
const detectWebsites = parseCsvEnv(process.env.DETECT_WEBSITES);
// Hostname suffixes that must never be crawled.
const blacklistDomains = parseDomainListEnv(process.env.BLACKLIST);
|
||||||
|
|
||||||
|
/**
 * Express handler that returns the title and `<body>` HTML of a page.
 *
 * Strategy: first try a plain HTTP fetch; when that fails (network error or
 * non-2xx status) fall back to rendering the page in headless Chromium.
 *
 * Query parameters:
 *  - `queryUrl` (required): absolute http(s) URL of the page to read.
 *
 * Responses:
 *  - 200 `{ status, data: { title, content } }`
 *  - 400 `MISSING_PARAM` / `INVALID_PARAM` for an absent or unusable `queryUrl`
 *  - 403 `BLACKLISTED_DOMAIN` when the hostname ends with a blacklisted suffix
 *  - 500 `INTERNAL_SERVER_ERROR` when both strategies fail
 *
 * After a successful read the content is written to the page cache; a cache
 * failure is logged and never affects the response that was already sent.
 */
export const readPage = async (req: Request, res: Response): Promise<void> => {
  const { queryUrl } = req.query;
  console.log('-------');
  console.log(queryUrl);
  console.log('-------');

  if (!queryUrl) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: queryUrl'
      }
    });
    return;
  }

  // Validate before use: Express may deliver arrays/objects, `new URL` throws on
  // malformed input, and non-http(s) schemes (e.g. file:) must never reach the browser.
  let parsedUrl: URL | null = null;
  if (typeof queryUrl === 'string') {
    try {
      parsedUrl = new URL(queryUrl);
    } catch {
      parsedUrl = null;
    }
  }
  if (
    typeof queryUrl !== 'string' ||
    !parsedUrl ||
    (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:')
  ) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'INVALID_PARAM',
        message: '参数 queryUrl 必须是合法的 http(s) 地址'
      }
    });
    return;
  }
  const targetUrl: string = queryUrl;
  const urlDomain = parsedUrl.hostname;

  if (blacklistDomains.some((domain: string) => urlDomain.endsWith(domain))) {
    res.status(403).json({
      status: 403,
      error: {
        code: 'BLACKLISTED_DOMAIN',
        message: '该域名受到保护中'
      }
    });
    return;
  }

  // Best-effort cache write: the response has already been sent when this runs,
  // so a storage failure is only logged.
  const storeInCache = async (html: string | null): Promise<void> => {
    try {
      await updateCacheAsync(targetUrl, html || '');
    } catch (cacheError) {
      console.error('Error updating page cache:', cacheError);
    }
  };

  // Fast path: plain HTTP request.
  try {
    const response = await fetch(targetUrl, {
      headers: {
        'User-Agent': new UserAgent({
          deviceCategory: 'desktop',
          platform: 'Linux x86_64'
        }).toString(),
        Referer: 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        Connection: 'keep-alive',
        'Cache-Control': 'no-cache'
      }
    });

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const content = await response.text();
    const $ = cheerio.load(content);
    const cleanedContent = $('body').html();

    res.status(200).json({
      status: 200,
      data: {
        title: $('title').text(),
        content: cleanedContent
      }
    });

    await storeInCache(cleanedContent);
    console.log('Page read successfully');
    return;
  } catch (error) {
    console.error('快速抓取页面时发生错误:', error);
    // Never start the fallback once a response has gone out.
    if (res.headersSent) {
      return;
    }
  }

  // Slow path: render the page in headless Chromium.
  try {
    let title = '';
    let cleanedContent: string | null = null;

    const browser = await puppeteer.launch({
      ignoreDefaultArgs: ['--enable-automation'],
      headless: true,
      executablePath: '/usr/bin/chromium', // Explicit Chromium binary path
      pipe: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu'
        // '--single-process'
      ]
    });

    try {
      const page = await browser.newPage();

      // Sites listed in DETECT_WEBSITES get the special page setup; all others
      // only receive a randomized desktop User-Agent.
      if (detectWebsites.some((website) => targetUrl.includes(website))) {
        await setupPage(page);
      } else {
        const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
        await page.setUserAgent(userAgent.toString());
      }

      await page.goto(parsedUrl.toString(), { waitUntil: 'load' });
      await page.waitForSelector('body');

      title = await page.title();
      cleanedContent = await handleSpecialWebsite(page, targetUrl);

      if (!cleanedContent) {
        const content = await page.content();
        const $ = cheerio.load(content);
        cleanedContent = $('body').html();
      }

      await page.close();
    } finally {
      // Always release the Chromium process, even when navigation or extraction throws.
      await browser.close();
    }

    res.status(200).json({
      status: 200,
      data: {
        title,
        content: cleanedContent
      }
    });

    await storeInCache(cleanedContent);
    console.log('Page read successfully');
  } catch (error) {
    console.error(error);
    if (!res.headersSent) {
      res.status(500).json({
        status: 500,
        error: {
          code: 'INTERNAL_SERVER_ERROR',
          message: '读取页面时发生内部服务器错误'
        }
      });
    }
  }
};
|
||||||
132
plugins/webcrawler/SPIDER/src/controllers/searchController.ts
Normal file
132
plugins/webcrawler/SPIDER/src/controllers/searchController.ts
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
import { Request, Response } from 'express';
|
||||||
|
import { Cluster } from 'puppeteer-cluster';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
import { performDeepSearch } from '../utils/deepSearch';
|
||||||
|
import { fetchSearchResults as fetchBaiduResults } from '../engines/baiduEngine';
|
||||||
|
import { fetchSearchResults as fetchSearchxngResults } from '../engines/searchxngEngine';
|
||||||
|
|
||||||
|
// Load variables from a local .env file into process.env before the settings below read them.
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Splits a comma-separated environment value into trimmed, non-empty entries.
 * An unset or empty variable yields an empty list. Without the filter, `''.split(',')`
 * would produce `['']`, and `url.includes('')` is true for every URL.
 */
const parseCsvEnv = (raw: string | undefined): string[] =>
  (raw ?? '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);

/**
 * Parses a positive integer from an environment value, returning `fallback`
 * when the value is unset, not numeric, or not greater than zero.
 */
const parsePositiveIntEnv = (raw: string | undefined, fallback: number): number => {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
};

/**
 * Parses the JSON array of page-load strategies. Throws on invalid JSON or a
 * non-array value so a broken configuration surfaces at start-up.
 */
const parseStrategiesEnv = (raw: string | undefined): unknown[] => {
  const parsed: unknown = JSON.parse(raw || '[]');
  if (!Array.isArray(parsed)) {
    throw new Error('STRATEGIES must be a JSON array');
  }
  return parsed;
};

// Page-load strategies handed to the deep search.
const strategies = parseStrategiesEnv(process.env.STRATEGIES);
// URL fragments identifying sites that get the special page setup before navigation.
const detectWebsites = parseCsvEnv(process.env.DETECT_WEBSITES);
// Upper bound on concurrently crawled pages; defaults to 10.
const maxConcurrency = parsePositiveIntEnv(process.env.MAX_CONCURRENCY, 10);
|
||||||
|
|
||||||
|
/**
 * Express handler for the search endpoint.
 *
 * Query parameters:
 *  - `query` (required): search terms.
 *  - `pageCount` (default 10): number of results wanted.
 *  - `needDetails` (default 'false'): when 'true', every result page is crawled
 *    through a Puppeteer cluster and the crawled results are returned.
 *  - `engine` (default 'baidu'): 'baidu' or 'searchxng'.
 *  - `categories` (default 'general'): forwarded to the engine.
 *
 * Responses:
 *  - 200 `{ status, data: { results } }`
 *  - 400 `MISSING_PARAM` / `INVALID_ENGINE`
 *  - 500 `INTERNAL_SERVER_ERROR` on any failure (the error is logged)
 */
export const search = async (req: Request, res: Response): Promise<void> => {
  const {
    query,
    pageCount = 10,
    needDetails = 'false',
    engine = 'baidu',
    categories = 'general'
  } = req.query;
  const needDetailsBool = needDetails === 'true';

  if (!query) {
    res.status(400).json({
      status: 400,
      error: {
        code: 'MISSING_PARAM',
        message: '缺少必要参数: query'
      }
    });
    return;
  }

  let fetchSearchResults;
  let searchUrlBase;
  try {
    if (engine === 'baidu') {
      fetchSearchResults = fetchBaiduResults;
      searchUrlBase = process.env.ENGINE_BAIDUURL;
    } else if (engine === 'searchxng') {
      fetchSearchResults = fetchSearchxngResults;
      searchUrlBase = process.env.ENGINE_SEARCHXNGURL;
    } else {
      res.status(400).json({
        status: 400,
        error: {
          code: 'INVALID_ENGINE',
          message: '无效的搜索引擎'
        }
      });
      return;
    }

    const { resultUrls, results } = await fetchSearchResults(
      query as string,
      Number(pageCount),
      searchUrlBase || '',
      categories as string
    );

    // Nothing found: answer with an empty list.
    if (results.size === 0) {
      console.log('No results found');
      res.status(200).json({
        status: 200,
        data: {
          results: []
        }
      });
      return;
    }

    if (!needDetailsBool) {
      console.log('Need details is false');
      // No crawl is performed in this mode, so pending entries are reported as done.
      results.forEach((value: any) => {
        if (value.crawlStatus === 'Pending') {
          value.crawlStatus = 'Success';
        }
      });
      res.status(200).json({
        status: 200,
        data: {
          results: Array.from(results.values())
        }
      });
      return;
    }

    console.log('Need details is true');

    // NOTE(review): the cluster is not closed in this function; presumably
    // performDeepSearch closes it once the crawl finishes — confirm.
    const clusterInstance = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: maxConcurrency,
      puppeteerOptions: {
        ignoreDefaultArgs: ['--enable-automation'],
        // Boolean, matching the other launch sites in this project.
        headless: true,
        executablePath: '/usr/bin/chromium', // Explicit Chromium binary path
        pipe: true,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu'
        ]
      }
    });

    const sortedResults = await performDeepSearch(
      clusterInstance,
      resultUrls,
      results,
      strategies,
      detectWebsites,
      Number(pageCount)
    );
    res.status(200).json({
      status: 200,
      data: {
        results: sortedResults.slice(0, Number(pageCount))
      }
    });
  } catch (error) {
    // Keep the cause in the logs; the client only receives a generic message.
    console.error('Error handling search request:', error);
    if (res.headersSent) {
      return;
    }
    res.status(500).json({
      status: 500,
      error: {
        code: 'INTERNAL_SERVER_ERROR',
        message: '发生错误'
      }
    });
  }
};
|
||||||
|
|
||||||
|
export default { search };
|
||||||
207
plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts
Normal file
207
plugins/webcrawler/SPIDER/src/engines/baiduEngine.ts
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
import { URL } from 'url';
|
||||||
|
import { JSDOM } from 'jsdom';
|
||||||
|
import puppeteer from 'puppeteer';
|
||||||
|
import { setupPage } from '../utils/setupPage';
|
||||||
|
import { Cluster } from 'puppeteer-cluster';
|
||||||
|
|
||||||
|
/**
 * Resolves after a random delay.
 *
 * @param min - shortest delay in milliseconds (inclusive)
 * @param max - longest delay in milliseconds (inclusive)
 * @returns a promise that settles once the timer fires
 */
async function randomWait(min: number, max: number) {
  const span = max - min + 1;
  const delay = Math.floor(Math.random() * span) + min;
  return new Promise((resolve) => {
    setTimeout(resolve, delay);
  });
}
|
||||||
|
|
||||||
|
/**
 * Scrapes Baidu result pages with headless Chromium and resolves each result's
 * redirect link to the URL it finally lands on.
 *
 * Steps:
 *  1. Load `ceil(pageCount / 10)` result pages, retrying up to 5 times per page
 *     (a security-verification page triggers a fresh page setup and a random wait).
 *  2. Follow every collected link with a plain HTTP request to learn its final URL.
 *  3. Links that fail in step 2 are retried through a Puppeteer cluster.
 *
 * @param query - search terms
 * @param pageCount - number of results wanted
 * @param searchUrlBase - Baidu search endpoint; an empty value yields empty results
 * @param categories - unused by this engine; kept for a signature parallel to the other engine
 * @returns `resultUrls` (final URLs) and `results` (final URL -> result record),
 *          both limited to the first `pageCount` entries
 */
export const fetchSearchResults = async (
  query: string,
  pageCount: number,
  searchUrlBase: string,
  categories: string
) => {
  console.log(`Fetching Baidu search results for query: ${query}`);
  // Without a configured endpoint there is nothing to query.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  const pagesToFetch = Math.ceil(pageCount / 10);

  const browser = await puppeteer.launch({
    ignoreDefaultArgs: ['--enable-automation'],
    headless: true,
    executablePath: '/usr/bin/chromium', // Explicit Chromium binary path
    pipe: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu'
      // '--single-process'
    ]
  });

  try {
    const page = await browser.newPage();
    await setupPage(page);

    for (let i = 0; i < pagesToFetch; i++) {
      const searchUrl = new URL(`${searchUrlBase}?wd=${encodeURIComponent(query)}&pn=${i * 10}`);
      console.log(`Fetching page ${i + 1} from Baidu: ${searchUrl.toString()}`);
      let retryCount = 0;
      let success = false;

      while (retryCount < 5 && !success) {
        try {
          console.time(`Page Load Time for page ${i + 1}`);
          await page.goto(searchUrl.toString(), { waitUntil: 'load' });
          console.timeEnd(`Page Load Time for page ${i + 1}`);

          const html = await page.content();
          const document = new JSDOM(html).window.document;
          console.log(document.title);

          // Security-verification page: set the page up again, wait a moment and retry.
          if (document.title.includes('百度安全验证')) {
            console.log('Detected Baidu security verification, retrying...');
            await setupPage(page);
            retryCount++;
            await randomWait(1000, 3000);
            continue;
          }

          // Parse the result entries.
          console.time(`Link Retrieval Time for page ${i + 1}`);

          const resultContainers = document.querySelectorAll('.result.c-container');
          for (const result of resultContainers) {
            // Collect a few spare links beyond pageCount, then stop.
            if (resultUrls.length > pageCount + 5) {
              break;
            }
            const titleElement = result.querySelector('h3 a');
            const title = titleElement ? titleElement.textContent : '';
            const url = titleElement ? titleElement.getAttribute('href') : '';
            const contentElement = result.querySelector('[class^="content"]');
            const snippet = contentElement ? contentElement.textContent : '';

            if (url) {
              resultUrls.push(url);
              results.set(url, {
                title,
                url,
                snippet,
                source: 'baidu',
                crawlStatus: 'Pending',
                score: 0
              });
            }
          }
          console.timeEnd(`Link Retrieval Time for page ${i + 1}`);
          success = true;
        } catch (error) {
          console.error(`Error fetching page ${i + 1}:`, error);
          retryCount++;
        }
      }

      if (!success) {
        console.warn(`Giving up on page ${i + 1} after ${retryCount} attempts`);
      }
    }
  } finally {
    // Release Chromium even when page setup or scraping throws.
    await browser.close();
  }

  console.log('fetch all fake urls');

  // Quick resolution of the real URLs through plain HTTP requests.
  const urlsToProcessWithPuppeteer: string[] = [];
  for (const url of resultUrls) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
          Referer: 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          Connection: 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (response.ok) {
        // After redirects, response.url is the address the link actually leads to.
        const realUrl = response.url;
        console.log('realurl:', realUrl);
        const result = results.get(url);
        if (result) {
          result.url = realUrl;
          result.crawlStatus = 'Success';
        }
      } else {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
    } catch (error) {
      console.error(`Error fetching original URL for ${url}:`, error);
      urlsToProcessWithPuppeteer.push(url);
    }
  }

  console.log('pass quickfetch');

  let failedUrlCount = 0;

  // Only start a Chromium cluster when some links still need a real browser.
  if (urlsToProcessWithPuppeteer.length > 0) {
    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 10,
      puppeteerOptions: {
        ignoreDefaultArgs: ['--enable-automation'],
        headless: true,
        executablePath: '/usr/bin/chromium', // Explicit Chromium binary path
        pipe: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
      }
    });

    try {
      await cluster.task(async ({ page, data: url }) => {
        let retryUrlCount = 0;
        let urlSuccess = false;
        while (retryUrlCount < 3 && !urlSuccess) {
          console.log(`Fetching original URL for ${url}, attempt ${retryUrlCount + 1}`);
          try {
            await page.goto(url, { waitUntil: 'load' });
            // Guard against the page having been closed in the meantime.
            if (page.isClosed()) {
              throw new Error('Page has been closed');
            }
            const realUrl = page.url(); // URL after navigation
            const result = results.get(url);
            if (result) {
              result.url = realUrl;
              result.crawlStatus = 'Success';
            }
            urlSuccess = true;
          } catch (error) {
            console.error(`Error fetching original URL, retrying...`, error);
            retryUrlCount++;
            await randomWait(1000, 3000);
          }
        }
        if (!urlSuccess) {
          failedUrlCount++;
        }
      });

      for (const url of urlsToProcessWithPuppeteer) {
        cluster.queue(url);
      }

      await cluster.idle();
    } finally {
      await cluster.close();
    }
  }

  console.log(`Number of URLs that failed to return a real URL: ${failedUrlCount}`);

  // Keep the first pageCount results and key them by their final URL.
  const filteredResults = Array.from(results.values()).slice(0, pageCount);

  return {
    resultUrls: filteredResults.map((result) => result.url),
    results: new Map(filteredResults.map((result) => [result.url, result]))
  };
};
|
||||||
64
plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts
Normal file
64
plugins/webcrawler/SPIDER/src/engines/searchxngEngine.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import axios from 'axios';
|
||||||
|
import { URL } from 'url';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
// Load variables from a local .env file into process.env before BLACKLIST is read.
dotenv.config();
|
||||||
|
|
||||||
|
/**
 * Parses a JSON-array environment value into a list of non-empty strings.
 * Throws when the value is not valid JSON or not an array, so a broken
 * configuration is reported at start-up instead of on the first search.
 */
const parseDomainListEnv = (raw: string | undefined): string[] => {
  if (!raw) {
    return [];
  }
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error('BLACKLIST must be a JSON array of domain suffixes');
  }
  // An empty suffix would match every hostname through endsWith(''), so drop it.
  return parsed.filter(
    (entry): entry is string => typeof entry === 'string' && entry.length > 0
  );
};

// Hostname suffixes whose results are dropped from the search output.
const blacklistDomains = parseDomainListEnv(process.env.BLACKLIST);
|
||||||
|
|
||||||
|
/**
 * Queries a SearXNG instance through its JSON API and collects results until
 * `pageCount` usable entries are gathered, the instance returns an empty page,
 * or the page budget is used up.
 *
 * Results are skipped when their URL is malformed, already collected, on a
 * blacklisted domain, or on a hostname containing 'zhihu'.
 *
 * @param query - search terms
 * @param pageCount - number of results wanted (not the number of pages)
 * @param searchUrlBase - SearXNG search endpoint; an empty value yields empty results
 * @param categories - SearXNG categories value, forwarded URL-encoded
 * @returns `resultUrls` in discovery order and `results` keyed by URL
 */
export const fetchSearchResults = async (
  query: string,
  pageCount: number,
  searchUrlBase: string,
  categories: string
) => {
  const MAX_PAGES = (pageCount / 10 + 1) * 2 + 1; // Upper bound on pages requested
  // Without a configured endpoint there is nothing to query.
  if (!searchUrlBase) {
    return { resultUrls: [], results: new Map() };
  }
  const resultUrls: string[] = [];
  const results = new Map<string, any>();

  let fetchedResultsCount = 0;
  let pageIndex = 0;

  while (fetchedResultsCount < pageCount && pageIndex < MAX_PAGES) {
    const searchUrl = new URL(
      `${searchUrlBase}?q=${encodeURIComponent(query)}&pageno=${pageIndex + 1}&format=json&categories=${encodeURIComponent(categories)}`
    );
    console.log(`Fetching page ${pageIndex + 1} from SearchXNG: ${searchUrl.toString()}`);
    const response = await axios.get(searchUrl.toString());
    // Treat a missing or malformed `results` field as an empty page.
    const jsonResults: any[] = Array.isArray(response.data?.results) ? response.data.results : [];

    for (const result of jsonResults) {
      let resultDomain: string;
      try {
        resultDomain = new URL(result.url).hostname;
      } catch {
        // One malformed result URL must not abort the whole search.
        continue;
      }
      if (
        blacklistDomains.some((domain: string) => resultDomain.endsWith(domain)) ||
        resultDomain.includes('zhihu')
      ) {
        continue;
      }
      // The same URL can appear on several pages; count and queue it only once.
      if (results.has(result.url)) {
        continue;
      }
      resultUrls.push(result.url);
      results.set(result.url, {
        title: result.title,
        url: result.url,
        snippet: result.content,
        source: result.engine,
        crawlStatus: 'Pending',
        score: result.score
      });
      fetchedResultsCount++;
      if (fetchedResultsCount >= pageCount) {
        break;
      }
    }
    pageIndex++;
    if (jsonResults.length === 0) {
      break; // No more results available
    }
  }

  return { resultUrls, results };
};
|
||||||
18
plugins/webcrawler/SPIDER/src/index.ts
Normal file
18
plugins/webcrawler/SPIDER/src/index.ts
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import express, { Application } from 'express';
|
||||||
|
import bodyParser from 'body-parser';
|
||||||
|
import searchRoutes from './routes/searchRoutes';
|
||||||
|
import readRoutes from './routes/readRoutes';
|
||||||
|
import quickfetchRoutes from './routes/quickfetchRoutes';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
// Load .env before PORT is read below.
dotenv.config();

const app: Application = express();

// Parse JSON request bodies, then mount every router under the shared /api prefix.
app.use(bodyParser.json());
for (const router of [searchRoutes, readRoutes, quickfetchRoutes]) {
  app.use('/api', router);
}

// Listen on PORT when it is set, otherwise on 3000.
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`Server running on port ${PORT}`);
});
|
||||||
21
plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts
Normal file
21
plugins/webcrawler/SPIDER/src/middleware/authMiddleware.ts
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import { Request, Response, NextFunction } from 'express';
|
||||||
|
|
||||||
|
/**
 * Express middleware that authorizes a request by comparing the second
 * space-separated part of the `Authorization` header with `ACCESS_TOKEN`.
 *
 *  - No `Authorization` header: 401 `{ message: 'Bearer token not found' }`
 *  - Token mismatch, or `ACCESS_TOKEN` not configured: 403 `{ message: 'Invalid token' }`
 *  - Match: calls `next()`
 *
 * The header is never logged because it carries the credential.
 */
const authMiddleware = (req: Request, res: Response, next: NextFunction) => {
  const bearerHeader = req.headers['authorization'];

  if (!bearerHeader) {
    res.status(401).json({ message: 'Bearer token not found' });
    return;
  }

  const bearerToken = bearerHeader.split(' ')[1];
  const expectedToken = process.env.ACCESS_TOKEN;

  // Require a configured token: otherwise a header without a token part
  // (undefined) would compare equal to an unset ACCESS_TOKEN (undefined).
  if (expectedToken && bearerToken === expectedToken) {
    next();
    return;
  }

  res.status(403).json({ message: 'Invalid token' });
};
|
||||||
|
|
||||||
|
export default authMiddleware;
|
||||||
9
plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/quickfetchRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import express from 'express';
|
||||||
|
import { quickFetch } from '../controllers/quickfetchController';
|
||||||
|
import authMiddleware from '../middleware/authMiddleware';
|
||||||
|
|
||||||
|
// Router for the quick-fetch endpoint. Named after this module rather than
// `readRoutes`, which is the name used by the separate read router.
const quickfetchRoutes = express.Router();

// GET /quickFetch — authMiddleware runs before the quickFetch handler.
quickfetchRoutes.get('/quickFetch', authMiddleware, quickFetch);

export default quickfetchRoutes;
|
||||||
9
plugins/webcrawler/SPIDER/src/routes/readRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/readRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import express from 'express';
|
||||||
|
import { readPage } from '../controllers/readController';
|
||||||
|
import authMiddleware from '../middleware/authMiddleware';
|
||||||
|
|
||||||
|
// Router for the page-reading endpoint: GET /read runs authMiddleware, then readPage.
const readRoutes = express.Router().get('/read', authMiddleware, readPage);
|
||||||
|
|
||||||
|
export default readRoutes;
|
||||||
9
plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts
Normal file
9
plugins/webcrawler/SPIDER/src/routes/searchRoutes.ts
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import express from 'express';
|
||||||
|
import searchController from '../controllers/searchController';
|
||||||
|
import authMiddleware from '../middleware/authMiddleware';
|
||||||
|
|
||||||
|
// Router for the search endpoint: GET /search runs authMiddleware, then the search handler.
const searchRoutes = express.Router().get('/search', authMiddleware, searchController.search);
|
||||||
|
|
||||||
|
export default searchRoutes;
|
||||||
26
plugins/webcrawler/SPIDER/src/specialHandlers/index.ts
Normal file
26
plugins/webcrawler/SPIDER/src/specialHandlers/index.ts
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
import { Page } from 'puppeteer';
|
||||||
|
|
||||||
|
/**
 * Site-specific content extraction for pages that need extra handling.
 *
 *  - `blog.csdn.net`: waits for the `<article>` element and returns its inner HTML.
 *  - `zhuanlan.zhihu.com`: returns null when the page body is Zhihu's
 *    rate-limit error payload; otherwise clicks the close button
 *    (aria-label "关闭") and returns the `<article>` inner HTML.
 *
 * @param page - Puppeteer page that has already navigated to `url`
 * @param url - address of the page, used to pick the handler
 * @returns the extracted HTML, or null when no special handling applies
 */
export const handleSpecialWebsite = async (page: Page, url: string): Promise<string | null> => {
  // Shared tail of both handlers: wait for the article and read its markup.
  const readArticle = async (): Promise<string> => {
    await page.waitForSelector('article');
    return page.$eval('article', (el) => el.innerHTML);
  };

  if (url.includes('blog.csdn.net')) {
    return readArticle();
  }

  if (url.includes('zhuanlan.zhihu.com')) {
    console.log('是知乎,需要点击按掉!');
    // Serialize the page once; the full HTML is deliberately not written to the log.
    const html = await page.content();
    if (
      html.includes(
        '{"error":{"message":"您当前请求存在异常,暂时限制本次访问。如有疑问,您可以通过手机摇一摇或登录后私信知乎小管家反馈。","code":40362}}'
      )
    ) {
      return null;
    }
    await page.waitForSelector('button[aria-label="关闭"]');
    await page.click('button[aria-label="关闭"]'); // Button is selected by its aria-label
    return readArticle();
  }

  // Handlers for further sites can be added here.
  return null;
};
|
||||||
77
plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts
Normal file
77
plugins/webcrawler/SPIDER/src/utils/cacheUpdater.ts
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import NodeCache from 'node-cache';
|
||||||
|
import { MongoClient } from 'mongodb';
|
||||||
|
import crypto from 'crypto';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
// Load variables from a local .env file into process.env before the cache settings are read.
dotenv.config();
|
||||||
|
|
||||||
|
// In-memory TTL in seconds. Falls back to 3600 when STD_TTL is unset or not a
// number, so NaN is never handed to the cache.
const stdTtlSeconds = Number.parseInt(process.env.STD_TTL || '3600', 10);
// First-level cache: in-process, keyed by page URL.
const cache = new NodeCache({ stdTTL: Number.isNaN(stdTtlSeconds) ? 3600 : stdTtlSeconds });
// Second-level cache: MongoDB, shared client for the whole process.
const mongoClient = new MongoClient(process.env.MONGODB_URI || 'mongodb://localhost:27017');
const dbName = 'pageCache';
const collectionName = 'pages';
|
||||||
|
|
||||||
|
/**
 * Connects the shared MongoDB client and returns a handle to the page-cache database.
 */
async function connectToMongo() {
  await mongoClient.connect();
  const database = mongoClient.db(dbName);
  return database;
}
|
||||||
|
|
||||||
|
/**
 * Creates the TTL index on `updatedAt` for the cached-pages collection.
 * The lifetime comes from EXPIRE_AFTER_SECONDS (default 9000 seconds).
 * Failures are logged and not rethrown.
 */
async function createTTLIndex() {
  const expireAfterSeconds = parseInt(process.env.EXPIRE_AFTER_SECONDS || '9000');
  try {
    const database = await connectToMongo();
    const pages = database.collection(collectionName);
    await pages.createIndex({ updatedAt: 1 }, { expireAfterSeconds });
    console.log('TTL index created successfully');
  } catch (error) {
    console.error('Error creating TTL index:', error);
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns the hex-encoded MD5 digest of the page content.
 */
const getPageHash = (content: string) => {
  const md5 = crypto.createHash('md5');
  md5.update(content);
  return md5.digest('hex');
};
|
||||||
|
|
||||||
|
/**
 * Looks a page up by URL: first in the in-memory cache, then in MongoDB.
 * A MongoDB hit is copied into the in-memory cache before it is returned.
 *
 * @param url - page address used as the cache key
 * @returns the cached record, or a falsy value when the URL is in neither store
 * @throws rethrows any database error after logging it
 */
export const getCachedPage = async (url: string) => {
  const inMemory = cache.get(url);
  if (inMemory) {
    return inMemory;
  }

  try {
    const database = await connectToMongo();
    const stored = await database.collection(collectionName).findOne({ url });
    if (stored) {
      cache.set(url, stored);
    }
    return stored;
  } catch (error) {
    console.error('Error getting cached page:', error);
    throw error;
  }
};
|
||||||
|
|
||||||
|
/**
 * Stores a page in both cache layers: the in-memory cache is written first,
 * then the record is upserted into MongoDB keyed by URL.
 *
 * @param url - page address used as the cache key
 * @param content - page HTML to store
 * @throws rethrows any database error after logging it
 */
const savePageToCache = async (url: string, content: string) => {
  const page = { url, content, hash: getPageHash(content), updatedAt: new Date() };

  // Memory layer first.
  cache.set(url, page);

  try {
    const pages = (await connectToMongo()).collection(collectionName);
    // Persistent layer: insert the record or overwrite the existing one for this URL.
    await pages.updateOne({ url }, { $set: page }, { upsert: true });
  } catch (error) {
    console.error('Error saving page to cache:', error);
    throw error;
  }
};
|
||||||
|
|
||||||
|
/**
 * Public entry point for cache writes; waits until the page has been saved
 * and propagates any error raised while saving.
 */
export const updateCacheAsync = async (url: string, content: string) => {
  const pendingSave = savePageToCache(url, content);
  await pendingSave;
};
|
||||||
|
|
||||||
|
/**
 * Closes the MongoDB client and ends the process. Exits with code 0 after a
 * clean close and 1 when closing fails, so a failed close can no longer leave
 * the process running.
 */
const closeMongoAndExit = async (): Promise<void> => {
  let exitCode = 0;
  try {
    await mongoClient.close();
  } catch (error) {
    console.error('Error closing MongoDB client:', error);
    exitCode = 1;
  }
  process.exit(exitCode);
};

// SIGINT covers Ctrl+C; SIGTERM is registered as well so that termination
// requests also close the client before exiting.
process.on('SIGINT', () => {
  void closeMongoAndExit();
});
process.on('SIGTERM', () => {
  void closeMongoAndExit();
});
|
||||||
|
|
||||||
|
// Create the TTL index once at application start-up. The call is not awaited;
// createTTLIndex logs its own failures.
void createTTLIndex();
|
||||||
158
plugins/webcrawler/SPIDER/src/utils/deepSearch.ts
Normal file
158
plugins/webcrawler/SPIDER/src/utils/deepSearch.ts
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
import { Cluster } from 'puppeteer-cluster';
|
||||||
|
import * as cheerio from 'cheerio';
|
||||||
|
import UserAgent from 'user-agents';
|
||||||
|
import { setupPage } from './setupPage';
|
||||||
|
import { getCachedPage, updateCacheAsync } from './cacheUpdater';
|
||||||
|
import { handleSpecialWebsite } from '../specialHandlers';
|
||||||
|
import fetch from 'node-fetch';
|
||||||
|
|
||||||
|
/** Shape of a page record as returned by the page cache. */
type CachedPage = {
  /** Address of the page; also the cache key. */
  url: string;
  /** Stored page HTML. */
  content: string;
  /** Digest of `content`. */
  hash: string;
  /** Time the record was last written. */
  updatedAt: Date;
};
|
||||||
|
|
||||||
|
/**
 * Fetches the content of the search-result URLs and stores it on the matching
 * entries of `results`.
 *
 * Each URL is processed in this order, stopping at the first step that yields content:
 *   1. cache lookup (`getCachedPage`);
 *   2. plain HTTP fetch, keeping the HTML of `<body>`;
 *   3. browser navigation through the cluster page, trying each entry of
 *      `strategies` in turn (a timeout moves on to the next strategy).
 *
 * Every failure path records `error` (always a message string) and
 * `crawlStatus: 'Failed'` on the entry while keeping the fields the entry
 * already had (for example `score`, which the final sort relies on).
 * A failing cache write is only logged; it never overturns a successful crawl.
 *
 * @param clusterInstance cluster used to run the crawl tasks; it is idled and
 *                        CLOSED before this function returns
 * @param resultUrls      URLs to crawl; at most `pageCount + 10` are queued
 * @param results         map from URL to result entry; mutated in place
 * @param strategies      navigation options (`waitUntil`, `timeout`) tried in order
 * @param detectWebsites  URL substrings for which the page gets `setupPage` applied
 * @param pageCount       number of pages wanted; bounds how many URLs are queued
 * @returns the result entries sorted by descending `score` (missing scores count as 0)
 */
export const performDeepSearch = async (
  clusterInstance: Cluster,
  resultUrls: string[],
  results: Map<string, any>,
  strategies: any[],
  detectWebsites: string[],
  pageCount: number
) => {
  const tasks: Promise<void>[] = [];

  // Normalises anything thrown into a message string
  const toMessage = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Stores crawled content on the entry, if the URL has one
  const markSucceeded = (searchUrl: string, content: string) => {
    const entry = results.get(searchUrl);
    if (entry) {
      entry.content = content;
      entry.crawlStatus = 'Success';
    }
  };

  // Flags the URL as failed without discarding the fields its entry already holds
  const markFailed = (searchUrl: string, message: string) => {
    results.set(searchUrl, {
      ...results.get(searchUrl),
      url: searchUrl,
      error: message,
      crawlStatus: 'Failed'
    });
  };

  // Cache writes are best effort: a failure is logged and never affects the crawl result
  const saveToCache = async (searchUrl: string, content: string) => {
    try {
      await updateCacheAsync(searchUrl, content);
    } catch (error) {
      console.error(`更新页面 ${searchUrl} 的缓存时发生错误:`, error);
    }
  };

  await clusterInstance.task(async ({ page, data: { searchUrl } }) => {
    // Step 1: serve from cache when possible
    try {
      const cachedPage = (await getCachedPage(searchUrl)) as CachedPage | null;
      if (cachedPage) {
        markSucceeded(searchUrl, cachedPage.content);
        return;
      }
    } catch (error) {
      console.error(`从缓存获取页面 ${searchUrl} 时发生错误:`, error);
      markFailed(searchUrl, toMessage(error));
      return;
    }

    // Step 2: cheap HTTP fetch; any error falls through to the browser path
    try {
      const response = await fetch(searchUrl, {
        headers: {
          'User-Agent': new UserAgent({
            deviceCategory: 'desktop',
            platform: 'Linux x86_64'
          }).toString(),
          Referer: 'https://www.google.com/',
          'Accept-Language': 'en-US,en;q=0.9',
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          Connection: 'keep-alive',
          'Cache-Control': 'no-cache'
        }
      });

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      const $ = cheerio.load(await response.text());
      const cleanedContent = $('body').html() || '';

      markSucceeded(searchUrl, cleanedContent);
      await saveToCache(searchUrl, cleanedContent);
      return;
    } catch (error) {
      console.error(`快速抓取页面 ${searchUrl} 时发生错误:`, error);
    }

    // Step 3a: prepare the browser page; a setup failure is logged and navigation is still attempted
    try {
      if (detectWebsites.some((website) => searchUrl.includes(website))) {
        await setupPage(page);
      } else {
        const userAgent = new UserAgent({ deviceCategory: 'desktop', platform: 'Linux x86_64' });
        await page.setUserAgent(userAgent.toString());
      }
    } catch (error) {
      console.error(`访问页面 ${searchUrl} 设置用户代理时发生错误:`, error);
    }

    // Step 3b: navigate, moving to the next strategy only on a timeout
    let pageLoaded = false;
    let pageLoadError: Error | null = null;
    for (const strategy of strategies) {
      try {
        await page.goto(searchUrl, { waitUntil: strategy.waitUntil, timeout: strategy.timeout });
        pageLoaded = true;
        break;
      } catch (error: unknown) {
        pageLoadError = error instanceof Error ? error : new Error(String(error));
        if (pageLoadError.name !== 'TimeoutError') {
          // Record the failure first, then rethrow as before so the cluster's
          // own error handling still sees the original error.
          markFailed(searchUrl, pageLoadError.message);
          throw error;
        }
      }
    }

    if (!pageLoaded) {
      markFailed(searchUrl, pageLoadError?.message ?? 'No page load strategy succeeded');
      return;
    }

    // Step 3c: extract content from the loaded page
    try {
      let cleanedContent = await handleSpecialWebsite(page, searchUrl);
      if (!cleanedContent) {
        const $ = cheerio.load(await page.content());
        cleanedContent = $('body').html() || '';
      }

      markSucceeded(searchUrl, cleanedContent);
      await saveToCache(searchUrl, cleanedContent || '');
    } catch (error) {
      markFailed(searchUrl, toMessage(error));
    } finally {
      await page.close().catch(() => {});
    }
  });

  for (const url of resultUrls) {
    // Queue a few more than requested so failed pages can be compensated for
    if (tasks.length >= pageCount + 10) {
      break;
    }
    tasks.push(clusterInstance.queue({ searchUrl: url }));
  }

  await Promise.all(tasks);

  await clusterInstance.idle();
  await clusterInstance.close();

  // Array.from copies, so the map's own ordering is left untouched
  return Array.from(results.values()).sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
};
|
||||||
81
plugins/webcrawler/SPIDER/src/utils/setupPage.ts
Normal file
81
plugins/webcrawler/SPIDER/src/utils/setupPage.ts
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
import { Page } from 'puppeteer';
|
||||||
|
import randomUseragent from 'random-useragent';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
/** Returns a random user-agent string supplied by the `random-useragent` package. */
const getRandomUserAgent = () => randomUseragent.getRandom();
|
||||||
|
|
||||||
|
/** Picks one of the three supported `navigator.platform` values at random. */
const getRandomPlatform = () => {
  const candidates = ['Win32', 'MacIntel', 'Linux x86_64'];
  const pick = Math.floor(Math.random() * candidates.length);
  return candidates[pick];
};
|
||||||
|
|
||||||
|
// Proxy pool

/**
 * Parses the raw VALIDATE_PROXY value into a list of proxy entries.
 * Returns an empty list when the value is unset, is not valid JSON, or is not
 * a JSON array, so a bad environment value cannot crash the module at import time.
 */
const parseProxyPool = (raw: string | undefined): any[] => {
  if (!raw) {
    return [];
  }
  try {
    const parsed: unknown = JSON.parse(raw);
    return Array.isArray(parsed) ? parsed : [];
  } catch (error) {
    console.error('VALIDATE_PROXY is not valid JSON; continuing without proxies:', error);
    return [];
  }
};

// Entries are expected to carry `ip` and `port` (that is what setupPage reads)
const validateproxy = parseProxyPool(process.env.VALIDATE_PROXY);

/** Returns a random entry of the proxy pool, or `null` when the pool is empty. */
const getRandomProxy = () => {
  return validateproxy.length > 0
    ? validateproxy[Math.floor(Math.random() * validateproxy.length)]
    : null;
};
|
||||||
|
|
||||||
|
/** Picks one of the predefined `navigator.languages` lists at random. */
const getRandomLanguages = () => {
  const chinese = ['zh-CN', 'zh', 'en'];
  const english = ['en-US', 'en', 'fr'];
  const spanish = ['es-ES', 'es', 'en'];
  const options = [chinese, english, spanish];
  const pick = Math.floor(Math.random() * options.length);
  return options[pick];
};
|
||||||
|
|
||||||
|
/**
 * Prepares a Puppeteer page so it looks less like an automated browser.
 *
 * - When the proxy pool is non-empty, a random entry is used for `page.authenticate`.
 * - A script is registered to run before every new document; it removes
 *   `navigator.webdriver`, installs a stub `window.chrome`, and overrides
 *   `navigator.userAgent`, `platform`, `plugins`, `languages` and the
 *   notifications branch of `permissions.query`.
 *
 * The spoofed user agent, platform and languages are chosen once per call and
 * stay fixed for the lifetime of the page.
 *
 * @param page the Puppeteer page to prepare
 */
export const setupPage = async (page: Page): Promise<void> => {
  const proxy = getRandomProxy();
  if (proxy) {
    // NOTE(review): the proxy's ip and port are sent as the HTTP auth
    // username/password — confirm this is what the proxy provider expects.
    await page.authenticate({
      username: proxy.ip,
      password: proxy.port.toString()
    });
  }

  // The callback below is serialized and executed inside the browser, where
  // this module's helpers do not exist. The random values are therefore
  // computed here and handed over as arguments.
  const userAgent = getRandomUserAgent();
  const platform = getRandomPlatform();
  const languages = getRandomLanguages();

  await page.evaluateOnNewDocument(
    (spoofedUserAgent: string, spoofedPlatform: string, spoofedLanguages: string[]) => {
      // Drop the automation marker from the navigator prototype
      const newProto = (navigator as any).__proto__;
      delete newProto.webdriver;
      (navigator as any).__proto__ = newProto;

      // Stub of the `window.chrome` object
      (window as any).chrome = {};
      (window as any).chrome.app = {
        InstallState: 'testt',
        RunningState: 'estt',
        getDetails: 'stte',
        getIsInstalled: 'ttes'
      };
      (window as any).chrome.csi = function () {};
      (window as any).chrome.loadTimes = function () {};
      (window as any).chrome.runtime = function () {};

      Object.defineProperty(navigator, 'userAgent', {
        get: () => spoofedUserAgent
      });
      Object.defineProperty(navigator, 'platform', {
        get: () => spoofedPlatform
      });
      Object.defineProperty(navigator, 'plugins', {
        get: () => [
          {
            description: 'Shockwave Flash',
            filename: 'pepflashplayer.dll',
            length: 1,
            name: 'Shockwave Flash'
          }
        ]
      });
      Object.defineProperty(navigator, 'languages', {
        get: () => spoofedLanguages
      });

      // Keep the original `query` bound to its receiver; calling the native
      // method detached from `navigator.permissions` fails.
      const permissions = window.navigator.permissions as any;
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters: any) =>
        parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission } as PermissionStatus)
          : originalQuery(parameters);
    },
    userAgent,
    platform,
    languages
  );
};
|
||||||
113
plugins/webcrawler/SPIDER/tsconfig.json
Normal file
113
plugins/webcrawler/SPIDER/tsconfig.json
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
/* Visit https://aka.ms/tsconfig to read more about this file */
|
||||||
|
/* Projects */
|
||||||
|
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
|
||||||
|
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
|
||||||
|
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
|
||||||
|
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
|
||||||
|
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
|
||||||
|
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||||
|
"types": ["node"],
|
||||||
|
/* Language and Environment */
|
||||||
|
"target": "es6", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||||
|
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||||
|
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||||
|
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
||||||
|
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
|
||||||
|
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
|
||||||
|
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
|
||||||
|
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
|
||||||
|
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
|
||||||
|
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
|
||||||
|
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
|
||||||
|
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||||
|
|
||||||
|
/* Modules */
|
||||||
|
//"module": "es6", /* Specify what module code is generated. */
|
||||||
|
"rootDir": "./src", /* Specify the root folder within your source files. */
|
||||||
|
"moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||||
|
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
||||||
|
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||||
|
// "rootDirs": [],                                   /* Allow multiple folders to be treated as one when resolving modules. */
|
||||||
|
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
|
||||||
|
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
|
||||||
|
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
|
||||||
|
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
|
||||||
|
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
|
||||||
|
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
|
||||||
|
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
|
||||||
|
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
|
||||||
|
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||||
|
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
|
||||||
|
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||||
|
|
||||||
|
/* JavaScript Support */
|
||||||
|
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
|
||||||
|
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
|
||||||
|
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
|
||||||
|
|
||||||
|
/* Emit */
|
||||||
|
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
|
||||||
|
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
|
||||||
|
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
|
||||||
|
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
|
||||||
|
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
|
||||||
|
// "noEmit": true, /* Disable emitting files from a compilation. */
|
||||||
|
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
|
||||||
|
"outDir": "./dist", /* Specify an output folder for all emitted files. */
|
||||||
|
// "removeComments": true, /* Disable emitting comments. */
|
||||||
|
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
|
||||||
|
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
|
||||||
|
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
|
||||||
|
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
|
||||||
|
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
|
||||||
|
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
|
||||||
|
// "newLine": "crlf", /* Set the newline character for emitting files. */
|
||||||
|
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
|
||||||
|
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
|
||||||
|
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
|
||||||
|
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
|
||||||
|
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
|
||||||
|
|
||||||
|
/* Interop Constraints */
|
||||||
|
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
|
||||||
|
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
|
||||||
|
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
|
||||||
|
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
|
||||||
|
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
|
||||||
|
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
|
||||||
|
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
|
||||||
|
|
||||||
|
/* Type Checking */
|
||||||
|
"typeRoots": ["./node_modules/@types"],
|
||||||
|
"strict": true, /* Enable all strict type-checking options. */
|
||||||
|
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
|
||||||
|
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
|
||||||
|
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
|
||||||
|
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
|
||||||
|
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
|
||||||
|
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
|
||||||
|
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
|
||||||
|
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
|
||||||
|
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
|
||||||
|
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
|
||||||
|
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
|
||||||
|
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
|
||||||
|
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
|
||||||
|
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
|
||||||
|
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
|
||||||
|
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
|
||||||
|
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
|
||||||
|
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
|
||||||
|
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||||
|
|
||||||
|
/* Completeness */
|
||||||
|
// "skipDefaultLibCheck": true,
|
||||||
|
// /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||||
|
"skipLibCheck": true/* Skip type checking all .d.ts files. */
|
||||||
|
|
||||||
|
},
|
||||||
|
"include": ["src/**/*.ts"],
|
||||||
|
"exclude": ["node_modules"]
|
||||||
|
}
|
||||||
55
plugins/webcrawler/SPIDER/webpack.config.js
Normal file
55
plugins/webcrawler/SPIDER/webpack.config.js
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
// Import the path package
const path = require('path')
require('dotenv').config();
const mode = process.env.NODE_ENV || 'development'

const nodeExternals = require('webpack-node-externals');
module.exports = {
  target: 'node', // Build for the Node.js runtime
  externals: [nodeExternals()], // Exclude node_modules
  // Entry file
  entry: "./src/index.ts",

  // Directory the bundle is written to
  output: {
    path: path.resolve(__dirname, 'dist'),
    // Name of the bundled file
    filename: "bundle.js"
  },
  resolve: {
    extensions: ['.ts', '.tsx', '.js', '.json'],
    // NOTE(review): these are browser polyfills, yet the target above is 'node' —
    // confirm they are still needed; each require.resolve fails at config load
    // if its package is not installed.
    fallback: {
      "zlib": require.resolve("browserify-zlib"),
      "querystring": require.resolve("querystring-es3"),
      "path": require.resolve("path-browserify"),
      "crypto": require.resolve("crypto-browserify"),
      "stream": require.resolve("stream-browserify"),
      "os": require.resolve("os-browserify/browser"),
      "http": require.resolve("stream-http"),
      "net": false,
      "string_decoder": require.resolve("string_decoder/"),
      "url": require.resolve("url/"),
      "buffer": require.resolve("buffer/"),
      "util": require.resolve("util/"),
      // Added fallback for assert
      "assert": require.resolve("assert/"),
      // Handles the newly appearing vm warning
      "vm": require.resolve("vm-browserify"),
      "fs": false
    }
  },

  // Modules webpack uses while bundling
  module: {
    // Loader rules
    rules: [
      {
        // `test` selects the files this rule applies to: ts-loader handles files ending in .ts
        test: /\.ts$/,
        use: 'ts-loader',
        exclude: /node_modules/
      }
    ]
  },
  mode,
}
|
||||||
124
plugins/webcrawler/docker-compose.yaml
Normal file
124
plugins/webcrawler/docker-compose.yaml
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
name: spider
|
||||||
|
version: "0.0.1"
|
||||||
|
|
||||||
|
services:
|
||||||
|
caddy:
|
||||||
|
container_name: caddy
|
||||||
|
image: docker.io/library/caddy:2-alpine
|
||||||
|
network_mode: host
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./Caddyfile:/etc/caddy/Caddyfile:ro
|
||||||
|
- caddy-data:/data:rw
|
||||||
|
- caddy-config:/config:rw
|
||||||
|
environment:
|
||||||
|
- SEARXNG_HOSTNAME=${SEARXNG_HOSTNAME:-http://localhost}
|
||||||
|
- SEARXNG_TLS=${LETSENCRYPT_EMAIL:-internal}
|
||||||
|
cap_add:
|
||||||
|
- NET_BIND_SERVICE
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "1m"
|
||||||
|
max-file: "1"
|
||||||
|
|
||||||
|
redis:
|
||||||
|
container_name: redis
|
||||||
|
image: docker.io/valkey/valkey:8-alpine
|
||||||
|
command: valkey-server --save 30 1 --loglevel warning
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- searxng
|
||||||
|
volumes:
|
||||||
|
- valkey-data2:/data
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
cap_add:
|
||||||
|
- SETGID
|
||||||
|
- SETUID
|
||||||
|
- DAC_OVERRIDE
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "1m"
|
||||||
|
max-file: "1"
|
||||||
|
|
||||||
|
searxng:
|
||||||
|
container_name: searxng
|
||||||
|
image: docker.io/searxng/searxng:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- searxng
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:8080:8080"
|
||||||
|
volumes:
|
||||||
|
- ./searxng:/etc/searxng:rw
|
||||||
|
environment:
|
||||||
|
- SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/
|
||||||
|
- UWSGI_WORKERS=${SEARXNG_UWSGI_WORKERS:-4}
|
||||||
|
- UWSGI_THREADS=${SEARXNG_UWSGI_THREADS:-4}
|
||||||
|
env_file:
|
||||||
|
- .searchxng.env
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
cap_add:
|
||||||
|
- CHOWN
|
||||||
|
- SETGID
|
||||||
|
- SETUID
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "1m"
|
||||||
|
max-file: "1"
|
||||||
|
|
||||||
|
mongodb:
|
||||||
|
container_name: mongodb
|
||||||
|
image: mongo:4.4
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- searxng
|
||||||
|
ports:
|
||||||
|
- "27017:27017"
|
||||||
|
volumes:
|
||||||
|
- mongo-data:/data/db
|
||||||
|
environment:
|
||||||
|
MONGO_INITDB_ROOT_USERNAME: root
|
||||||
|
MONGO_INITDB_ROOT_PASSWORD: example
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "1m"
|
||||||
|
max-file: "1"
|
||||||
|
|
||||||
|
nodeapp:
|
||||||
|
container_name: main
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
ports:
|
||||||
|
- "3000:3000"
|
||||||
|
networks:
|
||||||
|
- searxng
|
||||||
|
depends_on:
|
||||||
|
- mongodb
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "1m"
|
||||||
|
max-file: "1"
|
||||||
|
volumes:
|
||||||
|
- /dev/shm:/dev/shm
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 4G
|
||||||
|
cpus: '2.0'
|
||||||
|
networks:
|
||||||
|
searxng:
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
caddy-data:
|
||||||
|
caddy-config:
|
||||||
|
valkey-data2:
|
||||||
|
mongo-data:
|
||||||
16
plugins/webcrawler/searxng-docker.service.template
Normal file
16
plugins/webcrawler/searxng-docker.service.template
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=SearXNG service
|
||||||
|
Requires=docker.service
|
||||||
|
After=docker.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Restart=on-failure
|
||||||
|
|
||||||
|
Environment=SEARXNG_DOCKERCOMPOSEFILE=docker-compose.yaml
|
||||||
|
|
||||||
|
WorkingDirectory=/usr/local/searxng-docker
|
||||||
|
ExecStart=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} up --remove-orphans
|
||||||
|
ExecStop=/usr/local/bin/docker compose -f ${SEARXNG_DOCKERCOMPOSEFILE} down
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
6
plugins/webcrawler/searxng/limiter.toml
Normal file
6
plugins/webcrawler/searxng/limiter.toml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# This configuration file updates the default configuration file
|
||||||
|
# See https://github.com/searxng/searxng/blob/master/searx/limiter.toml
|
||||||
|
|
||||||
|
[botdetection.ip_limit]
|
||||||
|
# activate link_token method in the ip_limit method
|
||||||
|
link_token = true
|
||||||
38
plugins/webcrawler/searxng/settings.yml
Normal file
38
plugins/webcrawler/searxng/settings.yml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
# see https://docs.searxng.org/admin/settings/settings.html#settings-use-default-settings
|
||||||
|
use_default_settings: true
|
||||||
|
server:
|
||||||
|
# base_url is defined in the SEARXNG_BASE_URL environment variable, see .env and docker-compose.yml
|
||||||
|
secret_key: "01042f00ae8bb522a9c03d3e7e1910318208a2c9fbdd23a6315577a9c98553a8" # change this!
|
||||||
|
limiter: false # can be disabled for a private instance
|
||||||
|
image_proxy: true
|
||||||
|
ui:
|
||||||
|
static_use_hash: true
|
||||||
|
# Enable the cn category
|
||||||
|
enabled_categories: [cn, general, images] # add other categories as needed
|
||||||
|
# Or define the display order of the categories
|
||||||
|
categories_order: [cn, general, images]
|
||||||
|
redis:
|
||||||
|
url: redis://redis:6379/0
|
||||||
|
engines:
|
||||||
|
- name: bing
|
||||||
|
disabled: false
|
||||||
|
categories: cn
|
||||||
|
#- name: bilibili
|
||||||
|
# engine: bilibili
|
||||||
|
# shortcut: bil
|
||||||
|
# disabled: false
|
||||||
|
# categories: cn
|
||||||
|
- name : baidu
|
||||||
|
engine : json_engine
|
||||||
|
paging : True
|
||||||
|
first_page_num : 0
|
||||||
|
search_url : https://www.baidu.com/s?tn=json&wd={query}&pn={pageno}&rn=50
|
||||||
|
url_query : url
|
||||||
|
title_query : title
|
||||||
|
content_query : abs
|
||||||
|
categories : cn
|
||||||
|
|
||||||
|
search:
|
||||||
|
formats:
|
||||||
|
- html
|
||||||
|
- json
|
||||||
@@ -32,6 +32,7 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
|
|||||||
import { formatTime2YMDHMS } from '@fastgpt/global/common/string/time';
|
import { formatTime2YMDHMS } from '@fastgpt/global/common/string/time';
|
||||||
import MyModal from '@fastgpt/web/components/common/MyModal';
|
import MyModal from '@fastgpt/web/components/common/MyModal';
|
||||||
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
|
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
|
||||||
|
import SearchInput from '@fastgpt/web/components/common/Input/SearchInput';
|
||||||
|
|
||||||
type LogDetailType = {
|
type LogDetailType = {
|
||||||
id: number;
|
id: number;
|
||||||
@@ -55,11 +56,13 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
|||||||
|
|
||||||
const isRoot = userInfo?.username === 'root';
|
const isRoot = userInfo?.username === 'root';
|
||||||
const [filterProps, setFilterProps] = useState<{
|
const [filterProps, setFilterProps] = useState<{
|
||||||
|
request_id?: string;
|
||||||
channelId?: string;
|
channelId?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
code_type: 'all' | 'success' | 'error';
|
code_type: 'all' | 'success' | 'error';
|
||||||
dateRange: DateRangeType;
|
dateRange: DateRangeType;
|
||||||
}>({
|
}>({
|
||||||
|
request_id: '',
|
||||||
code_type: 'all',
|
code_type: 'all',
|
||||||
dateRange: {
|
dateRange: {
|
||||||
from: (() => {
|
from: (() => {
|
||||||
@@ -125,6 +128,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
|||||||
pageSize: 20,
|
pageSize: 20,
|
||||||
refreshDeps: [filterProps],
|
refreshDeps: [filterProps],
|
||||||
params: {
|
params: {
|
||||||
|
request_id: filterProps.request_id,
|
||||||
channel: filterProps.channelId,
|
channel: filterProps.channelId,
|
||||||
model_name: filterProps.model,
|
model_name: filterProps.model,
|
||||||
code_type: filterProps.code_type,
|
code_type: filterProps.code_type,
|
||||||
@@ -162,7 +166,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
|||||||
content: item.content
|
content: item.content
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}, [data]);
|
}, [channelList, data, systemModelList]);
|
||||||
|
|
||||||
const [logDetail, setLogDetail] = useState<LogDetailType>();
|
const [logDetail, setLogDetail] = useState<LogDetailType>();
|
||||||
|
|
||||||
@@ -172,6 +176,13 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
|
|||||||
<Flex alignItems={'center'}>
|
<Flex alignItems={'center'}>
|
||||||
{Tab}
|
{Tab}
|
||||||
<Box flex={1} />
|
<Box flex={1} />
|
||||||
|
<Box flex={'0 0 200px'}>
|
||||||
|
<SearchInput
|
||||||
|
placeholder={t('account_model:log_request_id_search')}
|
||||||
|
defaultValue={filterProps.request_id}
|
||||||
|
onBlur={(e) => setFilterProps({ ...filterProps, request_id: e.target.value })}
|
||||||
|
/>
|
||||||
|
</Box>
|
||||||
</Flex>
|
</Flex>
|
||||||
)}
|
)}
|
||||||
<HStack spacing={4}>
|
<HStack spacing={4}>
|
||||||
@@ -393,7 +404,7 @@ const LogDetail = ({ data, onClose }: { data: LogDetailType; onClose: () => void
|
|||||||
</GridItem>
|
</GridItem>
|
||||||
)}
|
)}
|
||||||
{detailData?.response_body && (
|
{detailData?.response_body && (
|
||||||
<GridItem display={'flex'} borderBottomWidth="1px" borderRightWidth="1px" colSpan={2}>
|
<GridItem display={'flex'} colSpan={2}>
|
||||||
<Title>Response Body</Title>
|
<Title>Response Body</Title>
|
||||||
<Container>{detailData?.response_body}</Container>
|
<Container>{detailData?.response_body}</Container>
|
||||||
</GridItem>
|
</GridItem>
|
||||||
|
|||||||
@@ -143,7 +143,7 @@ const checkInvalidData = async () => {
|
|||||||
|
|
||||||
console.log(`检测集合完成`);
|
console.log(`检测集合完成`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log(error);
|
console.log('checkInvalidData error', error);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -166,7 +166,9 @@ const checkInvalidDataText = async () => {
|
|||||||
await MongoDatasetDataText.deleteMany({
|
await MongoDatasetDataText.deleteMany({
|
||||||
dataId: { $in: unExistsSet }
|
dataId: { $in: unExistsSet }
|
||||||
});
|
});
|
||||||
} catch (error) {}
|
} catch (error) {
|
||||||
|
console.log('checkInvalidDataText error', error);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
|
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
/* 批量创建子插件 */
|
/* 批量创建子插件 */
|
||||||
@@ -88,7 +88,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
if (item.version === 'v2') {
|
if (item.version === 'v2') {
|
||||||
await MongoAppVersion.create(
|
await MongoAppVersion.create(
|
||||||
@@ -100,7 +100,7 @@ async function initHttp(teamId?: string): Promise<any> {
|
|||||||
edges: item.edges
|
edges: item.edges
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -160,7 +160,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
if (plugin.version === 'v2') {
|
if (plugin.version === 'v2') {
|
||||||
@@ -173,7 +173,7 @@ async function initPlugin(teamId?: string): Promise<any> {
|
|||||||
edges: plugin.edges
|
edges: plugin.edges
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -98,7 +98,8 @@ async function handler(
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ export const onCreateApp = async ({
|
|||||||
'pluginData.nodeVersion': defaultNodeVersion
|
'pluginData.nodeVersion': defaultNodeVersion
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!AppFolderTypeList.includes(type!)) {
|
if (!AppFolderTypeList.includes(type!)) {
|
||||||
@@ -144,7 +144,7 @@ export const onCreateApp = async ({
|
|||||||
isPublish: true
|
isPublish: true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -89,7 +89,8 @@ async function handler(req: ApiRequestProps<CreateAppFolderBody>) {
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ async function handler(req: ApiRequestProps<PostPublishAppProps>, res: NextApiRe
|
|||||||
tmbId
|
tmbId
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
// update app
|
// update app
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ async function handler(
|
|||||||
yuqueServer
|
yuqueServer
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
await refreshSourceAvatar(avatar, undefined, session);
|
await refreshSourceAvatar(avatar, undefined, session);
|
||||||
|
|
||||||
|
|||||||
@@ -34,17 +34,17 @@ async function handler(req: NextApiRequest) {
|
|||||||
});
|
});
|
||||||
const datasetIds = datasets.map((d) => d._id);
|
const datasetIds = datasets.map((d) => d._id);
|
||||||
|
|
||||||
|
// delete collection.tags
|
||||||
|
await MongoDatasetCollectionTags.deleteMany({
|
||||||
|
teamId,
|
||||||
|
datasetId: { $in: datasetIds }
|
||||||
|
});
|
||||||
|
|
||||||
// delete all dataset.data and pg data
|
// delete all dataset.data and pg data
|
||||||
await mongoSessionRun(async (session) => {
|
await mongoSessionRun(async (session) => {
|
||||||
// delete dataset data
|
// delete dataset data
|
||||||
await delDatasetRelevantData({ datasets, session });
|
await delDatasetRelevantData({ datasets, session });
|
||||||
|
|
||||||
// delete collection.tags
|
|
||||||
await MongoDatasetCollectionTags.deleteMany({
|
|
||||||
teamId,
|
|
||||||
datasetId: { $in: datasetIds }
|
|
||||||
}).session(session);
|
|
||||||
|
|
||||||
// delete dataset
|
// delete dataset
|
||||||
await MongoDataset.deleteMany(
|
await MongoDataset.deleteMany(
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ async function handler(
|
|||||||
permission: OwnerPermissionVal
|
permission: OwnerPermissionVal
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -122,7 +122,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
session
|
session,
|
||||||
|
ordered: true
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ export async function insertData2Dataset({
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
// 3. Create mongo data text
|
// 3. Create mongo data text
|
||||||
@@ -112,7 +112,7 @@ export async function insertData2Dataset({
|
|||||||
fullTextToken: jiebaSplit({ text: qaStr })
|
fullTextToken: jiebaSplit({ text: qaStr })
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -192,7 +192,7 @@ const rebuildData = async ({
|
|||||||
retryCount: 50
|
retryCount: 50
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ export async function initRootUser(retry = 3): Promise<any> {
|
|||||||
password: hashStr(psw)
|
password: hashStr(psw)
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
{ session }
|
{ session, ordered: true }
|
||||||
);
|
);
|
||||||
rootId = _id;
|
rootId = _id;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -152,6 +152,7 @@ export const putChannel = (data: ChannelInfoType) =>
|
|||||||
export const deleteChannel = (id: number) => DELETE(`/channel/${id}`);
|
export const deleteChannel = (id: number) => DELETE(`/channel/${id}`);
|
||||||
|
|
||||||
export const getChannelLog = (params: {
|
export const getChannelLog = (params: {
|
||||||
|
request_id?: string;
|
||||||
channel?: string;
|
channel?: string;
|
||||||
model_name?: string;
|
model_name?: string;
|
||||||
code_type?: 'all' | 'success' | 'error';
|
code_type?: 'all' | 'success' | 'error';
|
||||||
@@ -164,6 +165,7 @@ export const getChannelLog = (params: {
|
|||||||
logs: ChannelLogListItemType[];
|
logs: ChannelLogListItemType[];
|
||||||
total: number;
|
total: number;
|
||||||
}>(`/logs/search`, {
|
}>(`/logs/search`, {
|
||||||
|
request_id: params.request_id,
|
||||||
channel: params.channel,
|
channel: params.channel,
|
||||||
model_name: params.model_name,
|
model_name: params.model_name,
|
||||||
code_type: params.code_type,
|
code_type: params.code_type,
|
||||||
|
|||||||
Reference in New Issue
Block a user