diff --git a/.github/workflows/docs-deploy-kubeconfig.yml b/.github/workflows/docs-deploy-kubeconfig.yml index 9d4be40d2..cc6d11ca7 100644 --- a/.github/workflows/docs-deploy-kubeconfig.yml +++ b/.github/workflows/docs-deploy-kubeconfig.yml @@ -6,8 +6,6 @@ on: - 'docSite/**' branches: - 'main' - tags: - - 'v*.*.*' jobs: build-fastgpt-docs-images: diff --git a/.github/workflows/docs-deploy-vercel.yml b/.github/workflows/docs-deploy-vercel.yml index fed4a830b..0b8b91b50 100644 --- a/.github/workflows/docs-deploy-vercel.yml +++ b/.github/workflows/docs-deploy-vercel.yml @@ -7,8 +7,6 @@ on: - 'docSite/**' branches: - 'main' - tags: - - 'v*.*.*' # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: diff --git a/.github/workflows/docs-preview.yml b/.github/workflows/docs-preview.yml index a76242a3a..e66e5d64e 100644 --- a/.github/workflows/docs-preview.yml +++ b/.github/workflows/docs-preview.yml @@ -4,8 +4,6 @@ on: pull_request_target: paths: - 'docSite/**' - branches: - - 'main' workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel diff --git a/.github/workflows/fastgpt-build-image.yml b/.github/workflows/fastgpt-build-image.yml index 3dc362c68..ccd54859c 100644 --- a/.github/workflows/fastgpt-build-image.yml +++ b/.github/workflows/fastgpt-build-image.yml @@ -26,7 +26,7 @@ jobs: with: driver-opts: network=host - name: Cache Docker layers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ github.sha }} @@ -108,7 +108,7 @@ jobs: with: driver-opts: network=host - name: Cache Docker layers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ github.sha }} @@ -191,7 +191,7 @@ jobs: with: driver-opts: network=host - name: Cache Docker layers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ 
github.sha }} diff --git a/.github/workflows/sandbox-build-image.yml b/.github/workflows/sandbox-build-image.yml index a1a745a1f..8e58daf46 100644 --- a/.github/workflows/sandbox-build-image.yml +++ b/.github/workflows/sandbox-build-image.yml @@ -25,7 +25,7 @@ jobs: with: driver-opts: network=host - name: Cache Docker layers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ github.sha }} diff --git a/.vscode/settings.json b/.vscode/settings.json index c674d0733..bba595f12 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -27,7 +27,5 @@ }, "markdown.copyFiles.destination": { "/docSite/content/**/*": "${documentWorkspaceFolder}/docSite/assets/imgs/" - }, - "markdown.copyFiles.overwriteBehavior": "nameIncrementally", - "markdown.copyFiles.transformPath": "const filename = uri.path.split('/').pop(); return `/imgs/${filename}`;" + } } \ No newline at end of file diff --git a/README.md b/README.md index dfe0a8387..2a5793baf 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,6 @@ https://github.com/labring/FastGPT/assets/15308462/7d3a38df-eb0e-4388-9250-2409b * [系统配置文件说明](https://doc.tryfastgpt.ai/docs/development/configuration/) * [多模型配置方案](https://doc.tryfastgpt.ai/docs/development/modelconfig/one-api/) * [版本更新/升级介绍](https://doc.tryfastgpt.ai/docs/development/upgrading/) -* [AI Proxy API调用地址](https://sealos.run/aiproxy/?k=fastgpt-github/) * [OpenAPI API 文档](https://doc.tryfastgpt.ai/docs/development/openapi/) * [知识库结构详解](https://doc.tryfastgpt.ai/docs/guide/knowledge_base/rag/) @@ -115,16 +114,6 @@ https://github.com/labring/FastGPT/assets/15308462/7d3a38df-eb0e-4388-9250-2409b # -## 🏘️ 社区交流群 - -扫码加入飞书话题群: - -![](https://oss.laf.run/otnvvf-imgs/fastgpt-feishu1.png) - - - # - - ## 🏘️ 加入我们 我们正在寻找志同道合的小伙伴,加速 FastGPT 的发展。你可以通过 [FastGPT 2025 招聘](https://fael3z0zfze.feishu.cn/wiki/P7FOwEmPziVcaYkvVaacnVX1nvg)了解 FastGPT 的招聘信息。 @@ -133,19 +122,27 @@ 
https://github.com/labring/FastGPT/assets/15308462/7d3a38df-eb0e-4388-9250-2409b - [Laf:3 分钟快速接入三方应用](https://github.com/labring/laf) - [Sealos:快速部署集群应用](https://github.com/labring/sealos) -- [AI Proxy API调用地址](https://sealos.run/aiproxy/?k=fastgpt-github/) - [One API:多模型管理,支持 Azure、文心一言等](https://github.com/songquanpeng/one-api) -- [TuShan:5 分钟搭建后台管理系统](https://github.com/msgbyte/tushan) # - ## 🌿 第三方生态 -- [COW 个人微信/企微机器人](https://doc.tryfastgpt.ai/docs/use-cases/external-integration/onwechat/) +- [AI Proxy:国内模型聚合服务](https://sealos.run/aiproxy/?k=fastgpt-github/) - [SiliconCloud (硅基流动) —— 开源模型在线体验平台](https://cloud.siliconflow.cn/i/TR9Ym0c4) +- [COW 个人微信/企微机器人](https://doc.tryfastgpt.ai/docs/use-cases/external-integration/onwechat/) + + + # + + +## 🏘️ 社区交流群 + +扫码加入飞书话题群: + +![](https://oss.laf.run/otnvvf-imgs/fastgpt-feishu1.png) # diff --git a/deploy/docker/docker-compose-milvus.yml b/deploy/docker/docker-compose-milvus.yml index 3ccbc97dd..e9adc3ccd 100644 --- a/deploy/docker/docker-compose-milvus.yml +++ b/deploy/docker/docker-compose-milvus.yml @@ -137,10 +137,13 @@ services: - FE_DOMAIN= # root 密码,用户名为: root。如果需要修改 root 密码,直接修改这个环境变量,并重启即可。 - DEFAULT_ROOT_PSW=1234 - # AI模型的API地址哦。务必加 /v1。这里默认填写了OneApi的访问地址。 - - OPENAI_BASE_URL=http://oneapi:3000/v1 - # AI模型的API Key。(这里默认填写了OneAPI的快速默认key,测试通后,务必及时修改) - - CHAT_API_KEY=sk-fastgpt + # AI Proxy 的地址,如果配了该地址,优先使用 + - AIPROXY_API_ENDPOINT=http://aiproxy:3000 + # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY + - AIPROXY_API_TOKEN=aiproxy + # 模型中转地址(如果用了 AI Proxy,下面 2 个就不需要了,旧版 OneAPI 用户,使用下面的变量) + # - OPENAI_BASE_URL=http://oneapi:3000/v1 + # - CHAT_API_KEY=sk-fastgpt # 数据库最大连接数 - DB_MAX_LINK=30 # 登录凭证密钥 @@ -170,48 +173,52 @@ services: volumes: - ./config.json:/app/data/config.json - # oneapi - mysql: - image: registry.cn-hangzhou.aliyuncs.com/fastgpt/mysql:8.0.36 # 阿里云 - # image: mysql:8.0.36 - container_name: mysql - restart: always - ports: - - 3306:3306 - networks: - - fastgpt - command: 
--default-authentication-plugin=mysql_native_password - environment: - # 默认root密码,仅首次运行有效 - MYSQL_ROOT_PASSWORD: oneapimmysql - MYSQL_DATABASE: oneapi - volumes: - - ./mysql:/var/lib/mysql - oneapi: - container_name: oneapi - image: ghcr.io/songquanpeng/one-api:v0.6.7 - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/one-api:v0.6.6 # 阿里云 - ports: - - 3001:3000 + # AI Proxy + aiproxy: + image: 'ghcr.io/labring/sealos-aiproxy-service:latest' + container_name: aiproxy + restart: unless-stopped depends_on: - - mysql + aiproxy_pg: + condition: service_healthy networks: - fastgpt - restart: always environment: - # mysql 连接参数 - - SQL_DSN=root:oneapimmysql@tcp(mysql:3306)/oneapi - # 登录凭证加密密钥 - - SESSION_SECRET=oneapikey - # 内存缓存 - - MEMORY_CACHE_ENABLED=true - # 启动聚合更新,减少数据交互频率 - - BATCH_UPDATE_ENABLED=true - # 聚合更新时长 - - BATCH_UPDATE_INTERVAL=10 - # 初始化的 root 密钥(建议部署完后更改,否则容易泄露) - - INITIAL_ROOT_TOKEN=fastgpt + # 对应 fastgpt 里的AIPROXY_API_TOKEN + - ADMIN_KEY=aiproxy + # 错误日志详情保存时间(小时) + - LOG_DETAIL_STORAGE_HOURS=1 + # 数据库连接地址 + - SQL_DSN=postgres://postgres:aiproxy@aiproxy_pg:5432/aiproxy + # 最大重试次数 + - RetryTimes=3 + # 不需要计费 + - BILLING_ENABLED=false + # 不需要严格检测模型 + - DISABLE_MODEL_CONFIG=true + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:3000/api/status'] + interval: 5s + timeout: 5s + retries: 10 + aiproxy_pg: + image: pgvector/pgvector:0.8.0-pg15 # docker hub + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.8.0-pg15 # 阿里云 + restart: unless-stopped + container_name: aiproxy_pg volumes: - - ./oneapi:/data + - ./aiproxy_pg:/var/lib/postgresql/data + networks: + - fastgpt + environment: + TZ: Asia/Shanghai + POSTGRES_USER: postgres + POSTGRES_DB: aiproxy + POSTGRES_PASSWORD: aiproxy + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'postgres', '-d', 'aiproxy'] + interval: 5s + timeout: 5s + retries: 10 networks: fastgpt: diff --git a/deploy/docker/docker-compose-pgvector.yml b/deploy/docker/docker-compose-pgvector.yml index 
ec4f35817..2b64e6470 100644 --- a/deploy/docker/docker-compose-pgvector.yml +++ b/deploy/docker/docker-compose-pgvector.yml @@ -7,12 +7,12 @@ version: '3.3' services: # db pg: - image: pgvector/pgvector:0.7.0-pg15 # docker hub - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.7.0 # 阿里云 + image: pgvector/pgvector:0.8.0-pg15 # docker hub + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.8.0-pg15 # 阿里云 container_name: pg restart: always - ports: # 生产环境建议不要暴露 - - 5432:5432 + # ports: # 生产环境建议不要暴露 + # - 5432:5432 networks: - fastgpt environment: @@ -28,8 +28,8 @@ services: # image: mongo:4.4.29 # cpu不支持AVX时候使用 container_name: mongo restart: always - ports: - - 27017:27017 + # ports: + # - 27017:27017 networks: - fastgpt command: mongod --keyFile /data/mongodb.key --replSet rs0 @@ -95,10 +95,13 @@ services: - FE_DOMAIN= # root 密码,用户名为: root。如果需要修改 root 密码,直接修改这个环境变量,并重启即可。 - DEFAULT_ROOT_PSW=1234 - # AI模型的API地址哦。务必加 /v1。这里默认填写了OneApi的访问地址。 - - OPENAI_BASE_URL=http://oneapi:3000/v1 - # AI模型的API Key。(这里默认填写了OneAPI的快速默认key,测试通后,务必及时修改) - - CHAT_API_KEY=sk-fastgpt + # AI Proxy 的地址,如果配了该地址,优先使用 + - AIPROXY_API_ENDPOINT=http://aiproxy:3000 + # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY + - AIPROXY_API_TOKEN=aiproxy + # 模型中转地址(如果用了 AI Proxy,下面 2 个就不需要了,旧版 OneAPI 用户,使用下面的变量) + # - OPENAI_BASE_URL=http://oneapi:3000/v1 + # - CHAT_API_KEY=sk-fastgpt # 数据库最大连接数 - DB_MAX_LINK=30 # 登录凭证密钥 @@ -127,48 +130,52 @@ services: volumes: - ./config.json:/app/data/config.json - # oneapi - mysql: - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/mysql:8.0.36 # 阿里云 - image: mysql:8.0.36 - container_name: mysql - restart: always - ports: - - 3306:3306 - networks: - - fastgpt - command: --default-authentication-plugin=mysql_native_password - environment: - # 默认root密码,仅首次运行有效 - MYSQL_ROOT_PASSWORD: oneapimmysql - MYSQL_DATABASE: oneapi - volumes: - - ./mysql:/var/lib/mysql - oneapi: - container_name: oneapi - image: ghcr.io/songquanpeng/one-api:v0.6.7 - # 
image: registry.cn-hangzhou.aliyuncs.com/fastgpt/one-api:v0.6.6 # 阿里云 - ports: - - 3001:3000 + # AI Proxy + aiproxy: + image: 'ghcr.io/labring/sealos-aiproxy-service:latest' + container_name: aiproxy + restart: unless-stopped depends_on: - - mysql + aiproxy_pg: + condition: service_healthy networks: - fastgpt - restart: always environment: - # mysql 连接参数 - - SQL_DSN=root:oneapimmysql@tcp(mysql:3306)/oneapi - # 登录凭证加密密钥 - - SESSION_SECRET=oneapikey - # 内存缓存 - - MEMORY_CACHE_ENABLED=true - # 启动聚合更新,减少数据交互频率 - - BATCH_UPDATE_ENABLED=true - # 聚合更新时长 - - BATCH_UPDATE_INTERVAL=10 - # 初始化的 root 密钥(建议部署完后更改,否则容易泄露) - - INITIAL_ROOT_TOKEN=fastgpt + # 对应 fastgpt 里的AIPROXY_API_TOKEN + - ADMIN_KEY=aiproxy + # 错误日志详情保存时间(小时) + - LOG_DETAIL_STORAGE_HOURS=1 + # 数据库连接地址 + - SQL_DSN=postgres://postgres:aiproxy@aiproxy_pg:5432/aiproxy + # 最大重试次数 + - RetryTimes=3 + # 不需要计费 + - BILLING_ENABLED=false + # 不需要严格检测模型 + - DISABLE_MODEL_CONFIG=true + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:3000/api/status'] + interval: 5s + timeout: 5s + retries: 10 + aiproxy_pg: + image: pgvector/pgvector:0.8.0-pg15 # docker hub + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.8.0-pg15 # 阿里云 + restart: unless-stopped + container_name: aiproxy_pg volumes: - - ./oneapi:/data + - ./aiproxy_pg:/var/lib/postgresql/data + networks: + - fastgpt + environment: + TZ: Asia/Shanghai + POSTGRES_USER: postgres + POSTGRES_DB: aiproxy + POSTGRES_PASSWORD: aiproxy + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'postgres', '-d', 'aiproxy'] + interval: 5s + timeout: 5s + retries: 10 networks: fastgpt: diff --git a/deploy/docker/docker-compose-zilliz.yml b/deploy/docker/docker-compose-zilliz.yml index 508ec189b..0e4bb3ddb 100644 --- a/deploy/docker/docker-compose-zilliz.yml +++ b/deploy/docker/docker-compose-zilliz.yml @@ -75,10 +75,13 @@ services: - FE_DOMAIN= # root 密码,用户名为: root。如果需要修改 root 密码,直接修改这个环境变量,并重启即可。 - DEFAULT_ROOT_PSW=1234 - # AI模型的API地址哦。务必加 /v1。这里默认填写了OneApi的访问地址。 - - 
OPENAI_BASE_URL=http://oneapi:3000/v1 - # AI模型的API Key。(这里默认填写了OneAPI的快速默认key,测试通后,务必及时修改) - - CHAT_API_KEY=sk-fastgpt + # AI Proxy 的地址,如果配了该地址,优先使用 + - AIPROXY_API_ENDPOINT=http://aiproxy:3000 + # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY + - AIPROXY_API_TOKEN=aiproxy + # 模型中转地址(如果用了 AI Proxy,下面 2 个就不需要了,旧版 OneAPI 用户,使用下面的变量) + # - OPENAI_BASE_URL=http://oneapi:3000/v1 + # - CHAT_API_KEY=sk-fastgpt # 数据库最大连接数 - DB_MAX_LINK=30 # 登录凭证密钥 @@ -108,48 +111,52 @@ services: volumes: - ./config.json:/app/data/config.json - # oneapi - mysql: - image: registry.cn-hangzhou.aliyuncs.com/fastgpt/mysql:8.0.36 # 阿里云 - # image: mysql:8.0.36 - container_name: mysql - restart: always - ports: - - 3306:3306 - networks: - - fastgpt - command: --default-authentication-plugin=mysql_native_password - environment: - # 默认root密码,仅首次运行有效 - MYSQL_ROOT_PASSWORD: oneapimmysql - MYSQL_DATABASE: oneapi - volumes: - - ./mysql:/var/lib/mysql - oneapi: - container_name: oneapi - image: ghcr.io/songquanpeng/one-api:v0.6.7 - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/one-api:v0.6.6 # 阿里云 - ports: - - 3001:3000 + # AI Proxy + aiproxy: + image: 'ghcr.io/labring/sealos-aiproxy-service:latest' + container_name: aiproxy + restart: unless-stopped depends_on: - - mysql + aiproxy_pg: + condition: service_healthy networks: - fastgpt - restart: always environment: - # mysql 连接参数 - - SQL_DSN=root:oneapimmysql@tcp(mysql:3306)/oneapi - # 登录凭证加密密钥 - - SESSION_SECRET=oneapikey - # 内存缓存 - - MEMORY_CACHE_ENABLED=true - # 启动聚合更新,减少数据交互频率 - - BATCH_UPDATE_ENABLED=true - # 聚合更新时长 - - BATCH_UPDATE_INTERVAL=10 - # 初始化的 root 密钥(建议部署完后更改,否则容易泄露) - - INITIAL_ROOT_TOKEN=fastgpt + # 对应 fastgpt 里的AIPROXY_API_TOKEN + - ADMIN_KEY=aiproxy + # 错误日志详情保存时间(小时) + - LOG_DETAIL_STORAGE_HOURS=1 + # 数据库连接地址 + - SQL_DSN=postgres://postgres:aiproxy@aiproxy_pg:5432/aiproxy + # 最大重试次数 + - RetryTimes=3 + # 不需要计费 + - BILLING_ENABLED=false + # 不需要严格检测模型 + - DISABLE_MODEL_CONFIG=true + healthcheck: + test: ['CMD', 'curl', '-f', 
'http://localhost:3000/api/status'] + interval: 5s + timeout: 5s + retries: 10 + aiproxy_pg: + image: pgvector/pgvector:0.8.0-pg15 # docker hub + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.8.0-pg15 # 阿里云 + restart: unless-stopped + container_name: aiproxy_pg volumes: - - ./oneapi:/data + - ./aiproxy_pg:/var/lib/postgresql/data + networks: + - fastgpt + environment: + TZ: Asia/Shanghai + POSTGRES_USER: postgres + POSTGRES_DB: aiproxy + POSTGRES_PASSWORD: aiproxy + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'postgres', '-d', 'aiproxy'] + interval: 5s + timeout: 5s + retries: 10 networks: fastgpt: diff --git a/deploy/helm/fastgpt/templates/configmap-config.yaml b/deploy/helm/fastgpt/templates/configmap-config.yaml index 0aa7dcc90..4a760d560 100644 --- a/deploy/helm/fastgpt/templates/configmap-config.yaml +++ b/deploy/helm/fastgpt/templates/configmap-config.yaml @@ -6,6 +6,7 @@ data: "openapiPrefix": "fastgpt", "vectorMaxProcess": 15, "qaMaxProcess": 15, + "vlmMaxProcess": 15, "pgHNSWEfSearch": 100 }, "llmModels": [ diff --git a/docSite/assets/imgs/aiproxy-1.jpg b/docSite/assets/imgs/aiproxy-1.jpg new file mode 100644 index 000000000..1b8e8eaf7 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-1.jpg differ diff --git a/docSite/assets/imgs/aiproxy-1.png b/docSite/assets/imgs/aiproxy-1.png new file mode 100644 index 000000000..1b8e8eaf7 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-1.png differ diff --git a/docSite/assets/imgs/aiproxy-10.png b/docSite/assets/imgs/aiproxy-10.png new file mode 100644 index 000000000..e635a0fe3 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-10.png differ diff --git a/docSite/assets/imgs/aiproxy-11.png b/docSite/assets/imgs/aiproxy-11.png new file mode 100644 index 000000000..0a87605ae Binary files /dev/null and b/docSite/assets/imgs/aiproxy-11.png differ diff --git a/docSite/assets/imgs/aiproxy-2.png b/docSite/assets/imgs/aiproxy-2.png new file mode 100644 index 000000000..2f8e8a037 
Binary files /dev/null and b/docSite/assets/imgs/aiproxy-2.png differ diff --git a/docSite/assets/imgs/aiproxy-3.png b/docSite/assets/imgs/aiproxy-3.png new file mode 100644 index 000000000..769d22496 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-3.png differ diff --git a/docSite/assets/imgs/aiproxy-4.png b/docSite/assets/imgs/aiproxy-4.png new file mode 100644 index 000000000..b2e815b84 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-4.png differ diff --git a/docSite/assets/imgs/aiproxy-5.png b/docSite/assets/imgs/aiproxy-5.png new file mode 100644 index 000000000..11d5546c0 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-5.png differ diff --git a/docSite/assets/imgs/aiproxy-6.png b/docSite/assets/imgs/aiproxy-6.png new file mode 100644 index 000000000..6ea3ff906 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-6.png differ diff --git a/docSite/assets/imgs/aiproxy-7.png b/docSite/assets/imgs/aiproxy-7.png new file mode 100644 index 000000000..0c23fb606 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-7.png differ diff --git a/docSite/assets/imgs/aiproxy-8.png b/docSite/assets/imgs/aiproxy-8.png new file mode 100644 index 000000000..14e392669 Binary files /dev/null and b/docSite/assets/imgs/aiproxy-8.png differ diff --git a/docSite/assets/imgs/aiproxy-9.png b/docSite/assets/imgs/aiproxy-9.png new file mode 100644 index 000000000..0851486fb Binary files /dev/null and b/docSite/assets/imgs/aiproxy-9.png differ diff --git a/docSite/assets/imgs/aiproxy1.png b/docSite/assets/imgs/aiproxy1.png new file mode 100644 index 000000000..3905df329 Binary files /dev/null and b/docSite/assets/imgs/aiproxy1.png differ diff --git a/docSite/assets/imgs/image copy.png b/docSite/assets/imgs/image copy.png new file mode 100644 index 000000000..21184181b Binary files /dev/null and b/docSite/assets/imgs/image copy.png differ diff --git a/docSite/assets/imgs/marker2.png b/docSite/assets/imgs/marker2.png new file mode 100644 index 
000000000..20d6e6103 Binary files /dev/null and b/docSite/assets/imgs/marker2.png differ diff --git a/docSite/assets/imgs/marker3.png b/docSite/assets/imgs/marker3.png new file mode 100644 index 000000000..31d0586cf Binary files /dev/null and b/docSite/assets/imgs/marker3.png differ diff --git a/docSite/content/zh-cn/docs/development/configuration.md b/docSite/content/zh-cn/docs/development/configuration.md index 90434d663..8e5affee4 100644 --- a/docSite/content/zh-cn/docs/development/configuration.md +++ b/docSite/content/zh-cn/docs/development/configuration.md @@ -23,8 +23,54 @@ weight: 707 "systemEnv": { "vectorMaxProcess": 15, // 向量处理线程数量 "qaMaxProcess": 15, // 问答拆分线程数量 + "vlmMaxProcess": 15, // 图片理解模型最大处理进程 "tokenWorkers": 50, // Token 计算线程保持数,会持续占用内存,不能设置太大。 - "pgHNSWEfSearch": 100 // 向量搜索参数。越大,搜索越精确,但是速度越慢。设置为100,有99%+精度。 + "pgHNSWEfSearch": 100, // 向量搜索参数。越大,搜索越精确,但是速度越慢。设置为100,有99%+精度。 + "customPdfParse": { // 4.9.0 新增配置 + "url": "", // 自定义 PDF 解析服务地址 + "key": "", // 自定义 PDF 解析服务密钥 + "doc2xKey": "", // doc2x 服务密钥 + "price": 0 // PDF 解析服务价格 + } } } ``` + +## 自定义 PDF 解析配置 + +自定义 PDF 解析服务的优先级高于 Doc2x 服务,所以如果使用 Doc2x 服务,请勿配置自定义 PDF 服务。 + +### 使用 Sealos PDF 解析服务 + +#### 1. 申请 Sealos AI proxy API Key + +[点击打开 Sealos Pdf parser 官网](https://cloud.sealos.run/?uid=fnWRt09fZP&openapp=system-aiproxy),并进行对应 API Key 的申请。 + +#### 2. 修改 FastGPT 配置文件 + +`systemEnv.customPdfParse.url`填写成`https://aiproxy.hzh.sealos.run/v1/parse/pdf?model=parse-pdf` +`systemEnv.customPdfParse.key`填写成在 Sealos AI proxy 中申请的 API Key。 + +![](/imgs/deployconfig-aiproxy.png) + +### 使用 Doc2x 解析 PDF 文件 + +`Doc2x` 是一个国内的专业 PDF 解析服务。 + +#### 1. 申请 Doc2x 服务 + +[点击打开 Doc2x 官网](https://doc2x.noedgeai.com?inviteCode=9EACN2),并进行对应 API Key 的申请。 + +#### 2. 修改 FastGPT 配置文件 + +开源版用户在 `config.json` 文件中添加 `systemEnv.customPdfParse.doc2xKey` 配置,并填写上申请到的 API Key。并重启服务。 + +商业版用户在 Admin 后台根据表单指引填写 Doc2x 服务密钥。 + +#### 3. 
开始使用 + +在知识库导入数据或应用文件上传配置中,可以勾选`PDF 增强解析`,则在对 PDF 解析时候,会使用 Doc2x 服务进行解析。 + +### 使用 Marker 解析 PDF 文件 + +[点击查看 Marker 接入教程](/docs/development/custom-models/marker) \ No newline at end of file diff --git a/docSite/content/zh-cn/docs/development/custom-models/marker.md b/docSite/content/zh-cn/docs/development/custom-models/marker.md index 93c9d97bc..36bbdfe68 100644 --- a/docSite/content/zh-cn/docs/development/custom-models/marker.md +++ b/docSite/content/zh-cn/docs/development/custom-models/marker.md @@ -11,39 +11,51 @@ weight: 909 PDF 是一个相对复杂的文件格式,在 FastGPT 内置的 pdf 解析器中,依赖的是 pdfjs 库解析,该库基于逻辑解析,无法有效的理解复杂的 pdf 文件。所以我们在解析 pdf 时候,如果遇到图片、表格、公式等非简单文本内容,会发现解析效果不佳。 -市面上目前有多种解析 PDF 的方法,比如使用 [Marker](https://github.com/VikParuchuri/marker),该项目使用了 Surya 模型,基于视觉解析,可以有效提取图片、表格、公式等复杂内容。为了可以让 Marker 快速接入 FastGPT,我们做了一个自定义解析的拓展 Demo。 +市面上目前有多种解析 PDF 的方法,比如使用 [Marker](https://github.com/VikParuchuri/marker),该项目使用了 Surya 模型,基于视觉解析,可以有效提取图片、表格、公式等复杂内容。 -在 FastGPT 4.8.15 版本中,你可以通过增加一个环境变量,来替换掉 FastGPT 系统内置解析器,实现自定义的文档解析服务。该功能只是 Demo 阶段,后期配置模式和交互规则会发生改动。 +在 `FastGPT v4.9.0` 版本中,开源版用户可以在`config.json`文件中添加`systemEnv.customPdfParse`配置,来使用 Marker 解析 PDF 文件。商业版用户直接在 Admin 后台根据表单指引填写即可。需重新拉取 Marker 镜像,接口格式已变动。 ## 使用教程 -### 1. 按照 Marker +### 1. 安装 Marker 参考文档 [Marker 安装教程](https://github.com/labring/FastGPT/tree/main/plugins/model/pdf-marker),安装 Marker 模型。封装的 API 已经适配了 FastGPT 自定义解析服务。 这里介绍快速 Docker 安装的方法: ```dockerfile -docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest -docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest +docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.2 +docker run --gpus all -itd -p 7231:7232 --name model_pdf_v2 -e PROCESSES_PER_GPU="2" crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.2 +``` +### 2. 
添加 FastGPT 文件配置 + +```json +{ + xxx + "systemEnv": { + xxx + "customPdfParse": { + "url": "http://xxxx.com/v2/parse/file", // 自定义 PDF 解析服务地址 marker v0.2 + "key": "", // 自定义 PDF 解析服务密钥 + "doc2xKey": "", // doc2x 服务密钥 + "price": 0 // PDF 解析服务价格 + } + } +} ``` -### 2. 添加 FastGPT 环境变量 - -``` -CUSTOM_READ_FILE_URL=http://xxxx.com/v1/parse/file -CUSTOM_READ_FILE_EXTENSION=pdf -``` - -* CUSTOM_READ_FILE_URL - 自定义解析服务的地址, host改成解析服务的访问地址,path 不能变动。 -* CUSTOM_READ_FILE_EXTENSION - 支持的文件后缀,多个文件类型,可用逗号隔开。 +需要重启服务。 ### 3. 测试效果 -通过知识库上传一个 pdf 文件,并确认上传,可以在日志中看到 LOG (LOG_LEVEL需要设置 info 或者 debug): +通过知识库上传一个 pdf 文件,并勾选上 `PDF 增强解析`。 + +![alt text](/imgs/marker2.png) + +确认上传后,可以在日志中看到 LOG (LOG_LEVEL需要设置 info 或者 debug): ``` -[Info] 2024-12-05 15:04:42 Parsing files from an external service +[Info] 2024-12-05 15:04:42 Parsing files from an external service [Info] 2024-12-05 15:07:08 Custom file parsing is complete, time: 1316ms ``` @@ -51,6 +63,10 @@ CUSTOM_READ_FILE_EXTENSION=pdf ![alt text](/imgs/image-10.png) +同样的,在应用中,你可以在文件上传配置里,勾选上 `PDF 增强解析`。 + +![alt text](/imgs/marker3.png) + ## 效果展示 @@ -63,4 +79,25 @@ CUSTOM_READ_FILE_EXTENSION=pdf 上图是分块后的结果,下图是 pdf 原文。整体图片、公式、表格都可以提取出来,效果还是杠杠的。 -不过要注意的是,[Marker](https://github.com/VikParuchuri/marker) 的协议是`GPL-3.0 license`,请在遵守协议的前提下使用。 \ No newline at end of file +不过要注意的是,[Marker](https://github.com/VikParuchuri/marker) 的协议是`GPL-3.0 license`,请在遵守协议的前提下使用。 + +## 旧版 Marker 使用方法 + +FastGPT V4.9.0 版本之前,可以用以下方式,试用 Marker 解析服务。 + +安装和运行 Marker 服务: + +```dockerfile +docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.1 +docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 -e PROCESSES_PER_GPU="2" crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.1 +``` + +并修改 FastGPT 环境变量: + +``` +CUSTOM_READ_FILE_URL=http://xxxx.com/v1/parse/file +CUSTOM_READ_FILE_EXTENSION=pdf +``` + +* CUSTOM_READ_FILE_URL - 自定义解析服务的地址, host改成解析服务的访问地址,path 不能变动。 +* 
CUSTOM_READ_FILE_EXTENSION - 支持的文件后缀,多个文件类型,可用逗号隔开。 \ No newline at end of file diff --git a/docSite/content/zh-cn/docs/development/docker.md b/docSite/content/zh-cn/docs/development/docker.md index 1c776ea16..9a5962a04 100644 --- a/docSite/content/zh-cn/docs/development/docker.md +++ b/docSite/content/zh-cn/docs/development/docker.md @@ -30,7 +30,7 @@ weight: 707 ### PgVector版本 -非常轻量,适合数据量在 5000 万以下。 +非常轻量,适合知识库索引量在 5000 万以下。 {{< table "table-hover table-striped-columns" >}} | 环境 | 最低配置(单节点) | 推荐配置 | @@ -149,18 +149,14 @@ curl -o docker-compose.yml https://raw.githubusercontent.com/labring/FastGPT/mai {{< tab tabName="PgVector版本" >}} {{< markdownify >}} -``` -FE_DOMAIN=你的前端你访问地址,例如 http://192.168.0.1:3000;https://cloud.fastgpt.cn -``` +无需操作 {{< /markdownify >}} {{< /tab >}} {{< tab tabName="Milvus版本" >}} {{< markdownify >}} -``` -FE_DOMAIN=你的前端你访问地址,例如 http://192.168.0.1:3000;https://cloud.fastgpt.cn -``` +无需操作 {{< /markdownify >}} {{< /tab >}} @@ -174,7 +170,6 @@ FE_DOMAIN=你的前端你访问地址,例如 http://192.168.0.1:3000;https://clo {{% alert icon="🤖" context="success" %}} 1. 修改`MILVUS_ADDRESS`和`MILVUS_TOKEN`链接参数,分别对应 `zilliz` 的 `Public Endpoint` 和 `Api key`,记得把自己ip加入白名单。 -2. 修改FE_DOMAIN=你的前端你访问地址,例如 http://192.168.0.1:3000;https://cloud.fastgpt.cn {{% /alert %}} @@ -189,36 +184,28 @@ FE_DOMAIN=你的前端你访问地址,例如 http://192.168.0.1:3000;https://clo ```bash # 启动容器 docker-compose up -d -# 等待10s,OneAPI第一次总是要重启几次才能连上Mysql -sleep 10 -# 重启一次oneapi(由于OneAPI的默认Key有点问题,不重启的话会提示找不到渠道,临时手动重启一次解决,等待作者修复) -docker restart oneapi ``` -### 4. 打开 OneAPI 添加模型 +### 4. 访问 FastGPT -可以通过`ip:3001`访问OneAPI,默认账号为`root`密码为`123456`。 - -在OneApi中添加合适的AI模型渠道。[点击查看相关教程](/docs/development/modelconfig/one-api/) - -只需要添加模型即可,模板已经配置好了oneapi的连接地址和令牌,无需变更。 - -### 5. 
访问 FastGPT - -目前可以通过 `ip:3000` 直接访问(注意防火墙)。登录用户名为 `root`,密码为`docker-compose.yml`环境变量里设置的 `DEFAULT_ROOT_PSW`。 +目前可以通过 `ip:3000` 直接访问(注意开放防火墙)。登录用户名为 `root`,密码为`docker-compose.yml`环境变量里设置的 `DEFAULT_ROOT_PSW`。 如果需要域名访问,请自行安装并配置 Nginx。 -首次运行,会自动初始化 root 用户,密码为 `1234`(与环境变量中的`DEFAULT_ROOT_PSW`一致),日志里会提示一次`MongoServerError: Unable to read from a snapshot due to pending collection catalog changes;`可忽略。 +首次运行,会自动初始化 root 用户,密码为 `1234`(与环境变量中的`DEFAULT_ROOT_PSW`一致),日志可能会提示一次`MongoServerError: Unable to read from a snapshot due to pending collection catalog changes;`可忽略。 -### 6. 配置模型 +### 5. 配置模型 -登录FastGPT后,进入模型配置页面,务必先配置至少一个语言模型和一个向量模型,否则系统无法正常使用。 - -[点击查看模型配置教程](/docs/development/modelConfig/intro/) +- 首次登录FastGPT后,系统会提示未配置`语言模型`和`索引模型`,并自动跳转模型配置页面。系统必须至少有这两类模型才能正常使用。 +- 如果系统未正常跳转,可以在`账号-模型提供商`页面,进行模型配置。[点击查看相关教程](/docs/development/modelconfig/ai-proxy) +- 目前已知可能问题:首次进入系统后,整个浏览器 tab 无法响应。此时需要删除该tab,重新打开一次即可。 ## FAQ +### 登录系统后,浏览器无法响应 + +无法点击任何内容,刷新也无效。此时需要删除该tab,重新打开一次即可。 + ### Mongo 副本集自动初始化失败 最新的 docker-compose 示例优化 Mongo 副本集初始化,实现了全自动。目前在 unbuntu20,22 centos7, wsl2, mac, window 均通过测试。仍无法正常启动,大部分是因为 cpu 不支持 AVX 指令集,可以切换 Mongo4.x 版本。 diff --git a/docSite/content/zh-cn/docs/development/intro.md b/docSite/content/zh-cn/docs/development/intro.md index 8c38a77d0..d1ec79ba0 100644 --- a/docSite/content/zh-cn/docs/development/intro.md +++ b/docSite/content/zh-cn/docs/development/intro.md @@ -70,6 +70,7 @@ Mongo 数据库需要注意,需要注意在连接地址中增加 `directConnec - `vectorMaxProcess`: 向量生成最大进程,根据数据库和 key 的并发数来决定,通常单个 120 号,2c4g 服务器设置 10~15。 - `qaMaxProcess`: QA 生成最大进程 +- `vlmMaxProcess`: 图片理解模型最大进程 - `pgHNSWEfSearch`: PostgreSQL vector 索引参数,越大搜索精度越高但是速度越慢,具体可看 pgvector 官方说明。 ### 5. 
运行 diff --git a/docSite/content/zh-cn/docs/development/migration/docker_db.md b/docSite/content/zh-cn/docs/development/migration/docker_db.md index 019531353..4b34e4f5c 100644 --- a/docSite/content/zh-cn/docs/development/migration/docker_db.md +++ b/docSite/content/zh-cn/docs/development/migration/docker_db.md @@ -7,9 +7,18 @@ draft: false images: [] --- -## Copy文件 +## 1. 停止服务 + +```bash +docker-compose down +``` + + +## 2. Copy文件夹 Docker 部署数据库都会通过 volume 挂载本地的目录进入容器,如果要迁移,直接复制这些目录即可。 `PG 数据`: pg/data -`Mongo 数据`: mongo/data \ No newline at end of file +`Mongo 数据`: mongo/data + +直接把pg 和 mongo目录全部复制走即可。 \ No newline at end of file diff --git a/docSite/content/zh-cn/docs/development/modelConfig/ai-proxy.md b/docSite/content/zh-cn/docs/development/modelConfig/ai-proxy.md new file mode 100644 index 000000000..5d86c0957 --- /dev/null +++ b/docSite/content/zh-cn/docs/development/modelConfig/ai-proxy.md @@ -0,0 +1,129 @@ +--- +title: '通过 AI Proxy 接入模型' +description: '通过 AI Proxy 接入模型' +icon: 'api' +draft: false +toc: true +weight: 744 +--- + +从 `FastGPT 4.8.23` 版本开始,引入 AI Proxy 来进一步方便模型的配置。 + +AI Proxy 与 One API 类似,也是作为一个 OpenAI 接口管理 & 分发系统,可以通过标准的 OpenAI API 格式访问所有的大模型,开箱即用。 + +## 部署 + +### Docker 版本 + +`docker-compose.yml` 文件已加入了 AI Proxy 配置,可直接使用。[点击查看最新的 yml 配置](https://raw.githubusercontent.com/labring/FastGPT/main/deploy/docker/docker-compose-pgvector.yml) + +从旧版升级的用户,可以复制 yml 里,ai proxy 的配置,加入到旧的 yml 文件中。 + +## 运行原理 + +AI proxy 核心模块: + +1. 渠道管理:管理各家模型提供商的 API Key 和可用模型列表。 +2. 模型调用:根据请求的模型,选中对应的渠道;根据渠道的 API 格式,构造请求体,发送请求;格式化响应体成标准格式返回。 +3. 调用日志:详细记录模型调用的日志,并在错误时候可以记录其入参和报错信息,方便排查。 + +运行流程: + +![aiproxy12](/imgs/aiproxy1.png) + +## 在 FastGPT 中使用 + +AI proxy 相关功能,可以在`账号-模型提供商`页面找到。 + +### 1. 创建渠道 + +在`模型提供商`的配置页面,点击`模型渠道`,进入渠道配置页面 + +![aiproxy1](/imgs/aiproxy-1.png) + +点击右上角的“新增渠道”,即可进入渠道配置页面 + +![aiproxy2](/imgs/aiproxy-2.png) + +以阿里云的模型为例,进行如下配置 + +![aiproxy3](/imgs/aiproxy-3.png) + +1. 渠道名:展示在外部的渠道名称,仅作标识; +2. 厂商:模型对应的厂商,不同厂商对应不同的默认地址和 API 密钥格式; +3. 
模型:当前渠道具体可以使用的模型,系统内置了主流的一些模型,如果下拉框中没有想要的选项,可以点击“新增模型”,[增加自定义模型](/docs/development/modelconfig/intro/#新增自定义模型); +4. 模型映射:将 FastGPT 请求的模型,映射到具体提供的模型上。例如: + +```json +{ + "gpt-4o-test": "gpt-4o" +} +``` + +FastGPT 中的模型为 `gpt-4o-test`,向 AI Proxy 发起请求时也是 `gpt-4o-test`。AI proxy 在向上游发送请求时,实际的`model`为 `gpt-4o`。 + +5. 代理地址:具体请求的地址,系统给每个主流渠道配置了默认的地址,如果无需改动则不用填。 +6. API 密钥:从模型厂商处获取的 API 凭证。注意部分厂商需要提供多个密钥组合,可以根据提示进行输入。 + +最后点击“新增”,就能在“模型渠道”下看到刚刚配置的渠道 + +![aiproxy4](/imgs/aiproxy-4.png) + +### 2. 渠道测试 + +然后可以对渠道进行测试,确保配置的模型有效 + +![aiproxy5](/imgs/aiproxy-5.png) + +点击“模型测试”,可以看到配置的模型列表,点击“开始测试” + +![aiproxy6](/imgs/aiproxy-6.png) + +等待模型测试完成后,会输出每个模型的测试结果以及请求时长 + +![aiproxy7](/imgs/aiproxy-7.png) + +### 3. 启用模型 + +最后在`模型配置`中,可以选择启用对应的模型,这样就能在平台中使用了,更多模型配置可以参考[模型配置](/docs/development/modelconfig/intro) + +![aiproxy8](/imgs/aiproxy-8.png) + +## 其他功能介绍 + +### 优先级 + +范围1~100。数值越大,越容易被优先选中。 + +![aiproxy9](/imgs/aiproxy-9.png) + +### 启用/禁用 + +在渠道右侧的控制菜单中,还可以控制渠道的启用或禁用,被禁用的渠道将无法再提供模型服务 + +![aiproxy10](/imgs/aiproxy-10.png) + +### 调用日志 + +在 `调用日志` 页面,会展示发送到模型处的请求记录,包括具体的输入输出 tokens、请求时间、请求耗时、请求地址等等。错误的请求,则会详细记录入参和错误信息,方便排查,但仅会保留 1 小时(环境变量里可配置)。 + +![aiproxy11](/imgs/aiproxy-11.png) + +## 从 OneAPI 迁移到 AI Proxy + +可以从任意终端,发起 1 个 HTTP 请求。其中 `{{host}}` 替换成 AI Proxy 地址,`{{admin_key}}` 替换成 AI Proxy 中 `ADMIN_KEY` 的值。 + +Body 参数 `dsn` 为 OneAPI 的 mysql 连接串。 + +```bash +curl --location --request POST '{{host}}/api/channels/import/oneapi' \ +--header 'Authorization: Bearer {{admin_key}}' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "dsn": "mysql://root:s5mfkwst@tcp(dbconn.sealoshzh.site:33123)/mydb" +}' +``` + +执行成功的情况下会返回 "success": true + +脚本目前并不完全准确,仅做简单的数据映射,主要是迁移`代理地址`、`模型`和`API 密钥`,建议迁移后再进行手动检查。 \ No newline at end of file diff --git a/docSite/content/zh-cn/docs/development/modelConfig/intro.md b/docSite/content/zh-cn/docs/development/modelConfig/intro.md index 5aa6b219d..446eaae9e 100644 --- a/docSite/content/zh-cn/docs/development/modelConfig/intro.md +++ 
b/docSite/content/zh-cn/docs/development/modelConfig/intro.md @@ -13,9 +13,15 @@ weight: 744 ## 配置模型 -### 1. 使用 OneAPI 对接模型提供商 +### 1. 对接模型提供商 -可以使用 [OneAPI 接入教程](/docs/development/modelconfig/one-api) 来进行模型聚合,从而可以对接更多模型提供商。你需要先在各服务商申请好 API 接入 OneAPI 后,才能在 FastGPT 中使用这些模型。示例流程如下: +#### AI Proxy + +从 4.8.23 版本开始, FastGPT 支持在页面上配置模型提供商,即使用 [AI Proxy 接入教程](/docs/development/modelconfig/ai-proxy) 来进行模型聚合,从而可以对接更多模型提供商。 + +#### One API + +也可以使用 [OneAPI 接入教程](/docs/development/modelconfig/one-api)。你需要先在各服务商申请好 API 接入 OneAPI 后,才能在 FastGPT 中使用这些模型。示例流程如下: ![alt text](/imgs/image-95.png) @@ -28,17 +34,7 @@ weight: 744 在 OneAPI 配置好模型后,你就可以打开 FastGPT 页面,启用对应模型了。 -### 2. 登录 root 用户 - -仅 root 用户可以进行模型配置。 - -### 3. 进入模型配置页面 - -登录 root 用户后,在`账号-模型提供商-模型配置`中,你可以看到所有内置的模型和自定义模型,以及哪些模型启用了。 - -![alt text](/image-90.png) - -### 4. 配置介绍 +### 2. 配置介绍 {{% alert icon="🤖 " context="success" %}} 注意: diff --git a/docSite/content/zh-cn/docs/development/modelConfig/one-api.md b/docSite/content/zh-cn/docs/development/modelConfig/one-api.md index 537e90067..7d829bc22 100644 --- a/docSite/content/zh-cn/docs/development/modelConfig/one-api.md +++ b/docSite/content/zh-cn/docs/development/modelConfig/one-api.md @@ -20,10 +20,6 @@ FastGPT 目前采用模型分离的部署方案,FastGPT 中只兼容 OpenAI ## 部署 -### Docker 版本 - -`docker-compose.yml` 文件已加入了 OneAPI 配置,可直接使用。默认暴露在 3001 端口。 - ### Sealos 版本 * 北京区: [点击部署 OneAPI](https://hzh.sealos.run/?openapp=system-template%3FtemplateName%3Done-api) diff --git a/docSite/content/zh-cn/docs/development/modelConfig/siliconCloud.md b/docSite/content/zh-cn/docs/development/modelConfig/siliconCloud.md index 06036deb3..2f958b510 100644 --- a/docSite/content/zh-cn/docs/development/modelConfig/siliconCloud.md +++ b/docSite/content/zh-cn/docs/development/modelConfig/siliconCloud.md @@ -35,7 +35,7 @@ CHAT_API_KEY=sk-xxxxxx ![alt text](/imgs/image-104.png) -## 5. 体验测试 +## 4. 
体验测试 ### 测试对话和图片识别 diff --git a/docSite/content/zh-cn/docs/development/openapi/dataset.md b/docSite/content/zh-cn/docs/development/openapi/dataset.md index 582d80832..d43b2026d 100644 --- a/docSite/content/zh-cn/docs/development/openapi/dataset.md +++ b/docSite/content/zh-cn/docs/development/openapi/dataset.md @@ -297,7 +297,9 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete? | --- | --- | --- | | datasetId | 知识库ID | ✅ | | parentId: | 父级ID,不填则默认为根目录 | | -| trainingType | 训练模式。chunk: 按文本长度进行分割;qa: QA拆分;auto: 增强训练 | ✅ | +| trainingType | 数据处理方式。chunk: 按文本长度进行分割;qa: 问答对提取 | ✅ | +| autoIndexes | 是否自动生成索引(仅商业版支持) | | +| imageIndex | 是否自动生成图片索引(仅商业版支持) | | | chunkSize | 预估块大小 | | | chunkSplitter | 自定义最高优先分割符号 | | | qaPrompt | qa拆分提示词 | | @@ -1061,10 +1063,12 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/collect | 字段 | 类型 | 说明 | 必填 | | --- | --- | --- | --- | -| defaultIndex | Boolean | 是否为默认索引 | ✅ | -| dataId | String | 关联的向量ID | ✅ | +| type | String | 可选索引类型:default-默认索引; custom-自定义索引; summary-总结索引; question-问题索引; image-图片索引 | | +| dataId | String | 关联的向量ID,变更数据时候传入该 ID,会进行差量更新,而不是全量更新 | | | text | String | 文本内容 | ✅ | +`type` 不填则默认为 `custom` 索引,还会基于 q/a 组成一个默认索引。如果传入了默认索引,则不会额外创建。 + ### 为集合批量添加添加数据 注意,每次最多推送 200 组数据。 @@ -1079,7 +1083,7 @@ curl --location --request POST 'https://api.fastgpt.in/api/core/dataset/data/pus --header 'Content-Type: application/json' \ --data-raw '{     "collectionId": "64663f451ba1676dbdef0499", - "trainingMode": "chunk", + "trainingType": "chunk", "prompt": "可选。qa 拆分引导词,chunk 模式下忽略", "billId": "可选。如果有这个值,本次的数据会被聚合到一个订单中,这个值可以重复使用。可以参考 [创建训练订单] 获取该值。",     "data": [ @@ -1296,8 +1300,7 @@ curl --location --request GET 'http://localhost:3000/api/core/dataset/data/detai "chunkIndex": 0, "indexes": [ { - "defaultIndex": true, - "type": "chunk", + "type": "default", "dataId": "3720083", "text": "N o . 
2 0 2 2 1 2中 国 信 息 通 信 研 究 院京东探索研究院2022年 9月人工智能生成内容(AIGC)白皮书(2022 年)版权声明本白皮书版权属于中国信息通信研究院和京东探索研究院,并受法律保护。转载、摘编或利用其它方式使用本白皮书文字或者观点的,应注明“来源:中国信息通信研究院和京东探索研究院”。违反上述声明者,编者将追究其相关法律责任。前 言习近平总书记曾指出,“数字技术正以新理念、新业态、新模式全面融入人类经济、政治、文化、社会、生态文明建设各领域和全过程”。在当前数字世界和物理世界加速融合的大背景下,人工智能生成内容(Artificial Intelligence Generated Content,简称 AIGC)正在悄然引导着一场深刻的变革,重塑甚至颠覆数字内容的生产方式和消费模式,将极大地丰富人们的数字生活,是未来全面迈向数字文明新时代不可或缺的支撑力量。", "_id": "65abd4b29d1448617cba61dc" @@ -1333,12 +1336,18 @@ curl --location --request PUT 'http://localhost:3000/api/core/dataset/data/updat "a":"sss", "indexes":[ { - "dataId": "xxx", - "defaultIndex":false, - "text":"自定义索引1" + "dataId": "xxxx", + "type": "default", + "text": "默认索引" }, { - "text":"修改后的自定义索引2。(会删除原来的自定义索引2,并插入新的自定义索引2)" + "dataId": "xxx", + "type": "custom", + "text": "旧的自定义索引1" + }, + { + "type":"custom", + "text":"新增的自定义索引" } ] }' diff --git a/docSite/content/zh-cn/docs/development/upgrading/490.md b/docSite/content/zh-cn/docs/development/upgrading/490.md new file mode 100644 index 000000000..e6178dcbe --- /dev/null +++ b/docSite/content/zh-cn/docs/development/upgrading/490.md @@ -0,0 +1,188 @@ +--- +title: 'V4.9.0(进行中)' +description: 'FastGPT V4.9.0 更新说明' +icon: 'upgrade' +draft: false +toc: true +weight: 801 +--- + + +## 更新指南 + +### 1. 做好数据库备份 + +### 2. 更新镜像 + +- 更新 FastGPT 镜像 tag: v4.9.0-alpha +- 更新 FastGPT 商业版镜像 tag: v4.9.0-alpha +- Sandbox 镜像,可以不更新 + +### 3. 替换 OneAPI(可选) + +如果需要使用 AI Proxy 替换 OneAPI 的用户可执行该步骤。 + +#### 1. 
修改 yml 文件 + +参考[最新的 yml](https://raw.githubusercontent.com/labring/FastGPT/main/deploy/docker/docker-compose-pgvector.yml) 文件。里面已移除 OneAPI 并添加了 AI Proxy 配置。包含一个服务和一个 PgSQL 数据库。将 `aiproxy` 的配置`追加`到 OneAPI 的配置后面(先不要删除 OneAPI,有一个初始化会自动同步 OneAPI 的配置) + +{{% details title="AI Proxy Yml 配置" closed="true" %}} + +``` + # AI Proxy + aiproxy: + image: 'ghcr.io/labring/sealos-aiproxy-service:latest' + container_name: aiproxy + restart: unless-stopped + depends_on: + aiproxy_pg: + condition: service_healthy + networks: + - fastgpt + environment: + # 对应 fastgpt 里的AIPROXY_API_TOKEN + - ADMIN_KEY=aiproxy + # 错误日志详情保存时间(小时) + - LOG_DETAIL_STORAGE_HOURS=1 + # 数据库连接地址 + - SQL_DSN=postgres://postgres:aiproxy@aiproxy_pg:5432/aiproxy + # 最大重试次数 + - RetryTimes=3 + # 不需要计费 + - BILLING_ENABLED=false + # 不需要严格检测模型 + - DISABLE_MODEL_CONFIG=true + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:3000/api/status'] + interval: 5s + timeout: 5s + retries: 10 + aiproxy_pg: + image: pgvector/pgvector:0.8.0-pg15 # docker hub + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/pgvector:v0.8.0-pg15 # 阿里云 + restart: unless-stopped + container_name: aiproxy_pg + volumes: + - ./aiproxy_pg:/var/lib/postgresql/data + networks: + - fastgpt + environment: + TZ: Asia/Shanghai + POSTGRES_USER: postgres + POSTGRES_DB: aiproxy + POSTGRES_PASSWORD: aiproxy + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'postgres', '-d', 'aiproxy'] + interval: 5s + timeout: 5s + retries: 10 +``` + +{{% /details %}} + +#### 2. 增加 FastGPT 环境变量: + +修改 yml 文件中,fastgpt 容器的环境变量: + +``` +# AI Proxy 的地址,如果配了该地址,优先使用 +- AIPROXY_API_ENDPOINT=http://aiproxy:3000 +# AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY 保持一致 +- AIPROXY_API_TOKEN=aiproxy +``` + +#### 3. 重载服务 + +`docker-compose down` 停止服务,然后 `docker-compose up -d` 启动服务,此时会追加 `aiproxy` 服务,并修改 FastGPT 的配置。 + +#### 4. 
执行 OneAPI 迁移 AI Proxy 脚本 + +- 可联网方案: + +```bash +# 进入 aiproxy 容器 +docker exec -it aiproxy sh +# 安装 curl +apk add curl +# 执行脚本 +curl --location --request POST 'http://localhost:3000/api/channels/import/oneapi' \ +--header 'Authorization: Bearer aiproxy' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "dsn": "mysql://root:oneapimmysql@tcp(mysql:3306)/oneapi" +}' +# 返回 {"data":[],"success":true} 代表成功 +``` + +- 无法联网时,可打开`aiproxy`的外网暴露端口,然后在本地执行脚本。 + +aiproxy 暴露端口:3003:3000,修改后重新 `docker-compose up -d` 启动服务。 + +```bash +# 在终端执行脚本 +curl --location --request POST 'http://localhost:3003/api/channels/import/oneapi' \ +--header 'Authorization: Bearer aiproxy' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "dsn": "mysql://root:oneapimmysql@tcp(mysql:3306)/oneapi" +}' +# 返回 {"data":[],"success":true} 代表成功 +``` + +- 如果不熟悉 docker 操作,建议不要走脚本迁移,直接删除 OneAPI 所有内容,然后手动重新添加渠道。 + +#### 5. 进入 FastGPT 检查`AI Proxy` 服务是否正常启动。 + +登录 root 账号后,在`账号-模型提供商`页面,可以看到多出了`模型渠道`和`调用日志`两个选项,打开模型渠道,可以看到之前 OneAPI 的渠道,说明迁移完成,此时可以手动再检查下渠道是否正常。 + +#### 6. 删除 OneAPI 服务 + +```bash +# 停止服务,或者针对性停止 OneAPI 和其 Mysql +docker-compose down +# yml 文件中删除 OneAPI 和其 Mysql 依赖 +# 重启服务 +docker-compose up -d +``` + +### 4. 运行 FastGPT 升级脚本 + +从任意终端,发起 1 个 HTTP 请求。其中 {{rootkey}} 替换成环境变量里的 `rootkey`;{{host}} 替换成**FastGPT 域名**。 + +```bash +curl --location --request POST 'https://{{host}}/api/admin/initv490' \ +--header 'rootkey: {{rootkey}}' \ +--header 'Content-Type: application/json' +``` + +**脚本功能** + +1. 升级 PG Vector 插件版本 +2. 全量更新知识库集合字段。 +3. 全量更新知识库数据中,index 的 type 类型。(时间较长,最后可能提示 timeout,可忽略,只要数据库不崩溃,就会一直增量执行) + +## 兼容 & 弃用 + +1. 弃用 - 之前私有化部署的自定义文件解析方案,请同步更新到最新的配置方案。[点击查看 PDF 增强解析配置](/docs/development/configuration/#使用-doc2x-解析-pdf-文件) +2. 弃用 - 弃用旧版本地文件上传 API:/api/core/dataset/collection/create/file(以前仅商业版可用的 API,该接口已切换成:/api/core/dataset/collection/create/localFile) +3. 停止维护,即将弃用 - 外部文件库相关 API,可通过 API 文件库替代。 +4. 
API 更新 - 上传文件至知识库、创建链接集合、API 文件库、推送分块数据等带有 `trainingType` 字段的接口,`trainingType`字段未来仅支持`chunk`和`qa`两种模式。增强索引模式将设置单独字段:`autoIndexes`,目前仍有适配旧版`trainingType=auto`代码,但请尽快变更成新接口类型。具体可见:[知识库 OpenAPI 文档](/docs/development/openapi/dataset/) + +## 🚀 新增内容 + +1. PDF 增强解析交互添加到页面上。同时内嵌 Doc2x 服务,可直接使用 Doc2x 服务解析 PDF 文件。 +2. 图片自动标注,同时修改知识库文件上传部分数据逻辑和交互。 +3. pg vector 插件升级 0.8.0 版本,引入迭代搜索,减少部分数据无法被检索的情况。 + +## ⚙️ 优化 + +1. 知识库数据不再限制索引数量,可无限自定义。同时可自动更新输入文本的索引,不影响自定义索引。 +2. Markdown 解析,增加链接后中文标点符号检测,增加空格。 +3. Prompt 模式工具调用,支持思考模型。同时优化其格式检测,减少空输出的概率。 +4. Mongo 文件读取流合并,减少计算量。同时优化存储 chunks,极大提高大文件读取速度。50M PDF 读取速度提高 3 倍。 +5. HTTP Body 适配,增加对字符串对象的适配。 + +## 🐛 修复 + +1. 增加网页抓取安全链接校验。 +2. 批量运行时,全局变量未进一步传递到下一次运行中,导致最终变量更新错误。 \ No newline at end of file diff --git a/packages/global/common/fn/utils.ts b/packages/global/common/fn/utils.ts deleted file mode 100644 index 43a1cfc9a..000000000 --- a/packages/global/common/fn/utils.ts +++ /dev/null @@ -1,31 +0,0 @@ -export const retryRun = (fn: () => T, retry = 2): T => { - try { - return fn(); - } catch (error) { - if (retry > 0) { - return retryRun(fn, retry - 1); - } - throw error; - } -}; - -export const batchRun = async (arr: T[], fn: (arr: T) => any, batchSize = 10) => { - const batchArr = new Array(batchSize).fill(null); - const result: any[] = []; - - const batchFn = async () => { - const data = arr.shift(); - if (data) { - result.push(await fn(data)); - return batchFn(); - } - }; - - await Promise.all( - batchArr.map(async () => { - await batchFn(); - }) - ); - - return result; -}; diff --git a/packages/global/common/string/markdown.ts b/packages/global/common/string/markdown.ts index 410ca4c75..43a6e895a 100644 --- a/packages/global/common/string/markdown.ts +++ b/packages/global/common/string/markdown.ts @@ -1,4 +1,4 @@ -import { batchRun } from '../fn/utils'; +import { batchRun } from '../system/utils'; import { getNanoid, simpleText } from './tools'; import type { ImageType } from '../../../service/worker/readFile/type'; @@ -37,6 +37,80 
@@ export const simpleMarkdownText = (rawText: string) => { return rawText.trim(); }; +export const htmlTable2Md = (content: string): string => { + return content.replace(/[\s\S]*?<\/table>/g, (htmlTable) => { + try { + // Clean up whitespace and newlines + const cleanHtml = htmlTable.replace(/\n\s*/g, ''); + const rows = cleanHtml.match(/(.*?)<\/tr>/g); + if (!rows) return htmlTable; + + // Parse table data + let tableData: string[][] = []; + let maxColumns = 0; + + // Try to convert to markdown table + rows.forEach((row, rowIndex) => { + if (!tableData[rowIndex]) { + tableData[rowIndex] = []; + } + let colIndex = 0; + const cells = row.match(/(.*?)<\/td>/g) || []; + + cells.forEach((cell) => { + while (tableData[rowIndex][colIndex]) { + colIndex++; + } + const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1'); + const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1'); + const content = cell.replace(/|<\/td>/g, '').trim(); + + for (let i = 0; i < rowspan; i++) { + for (let j = 0; j < colspan; j++) { + if (!tableData[rowIndex + i]) { + tableData[rowIndex + i] = []; + } + tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^'; + } + } + colIndex += colspan; + maxColumns = Math.max(maxColumns, colIndex); + }); + + for (let i = 0; i < maxColumns; i++) { + if (!tableData[rowIndex][i]) { + tableData[rowIndex][i] = ' '; + } + } + }); + const chunks: string[] = []; + + const headerCells = tableData[0] + .slice(0, maxColumns) + .map((cell) => (cell === '^^' ? ' ' : cell || ' ')); + const headerRow = '| ' + headerCells.join(' | ') + ' |'; + chunks.push(headerRow); + + const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |'; + chunks.push(separator); + + tableData.slice(1).forEach((row) => { + const paddedRow = row + .slice(0, maxColumns) + .map((cell) => (cell === '^^' ? 
' ' : cell || ' ')); + while (paddedRow.length < maxColumns) { + paddedRow.push(' '); + } + chunks.push('| ' + paddedRow.join(' | ') + ' |'); + }); + + return chunks.join('\n'); + } catch (error) { + return htmlTable; + } + }); +}; + /** * format markdown * 1. upload base64 @@ -94,7 +168,7 @@ export const markdownProcess = async ({ return simpleMarkdownText(imageProcess); }; -export const matchMdImgTextAndUpload = (text: string) => { +export const matchMdImg = (text: string) => { const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g; const imageList: ImageType[] = []; diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts index b7f8334ad..a2b2367c1 100644 --- a/packages/global/common/system/types/index.d.ts +++ b/packages/global/common/system/types/index.d.ts @@ -43,10 +43,14 @@ export type FastGPTConfigFileType = { export type FastGPTFeConfigsType = { show_workorder?: boolean; show_emptyChat?: boolean; + isPlus?: boolean; register_method?: ['email' | 'phone' | 'sync']; login_method?: ['email' | 'phone']; // Attention: login method is diffrent with oauth find_password_method?: ['email' | 'phone']; bind_notification_method?: ['email' | 'phone']; + googleClientVerKey?: string; + + show_emptyChat?: boolean; show_appStore?: boolean; show_git?: boolean; show_pay?: boolean; @@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = { show_aiproxy?: boolean; concatMd?: string; + concatMd?: string; docUrl?: string; openAPIDocUrl?: string; systemPluginCourseUrl?: string; appTemplateCourse?: string; + customApiDomain?: string; + customSharePageDomain?: string; systemTitle?: string; systemDescription?: string; - googleClientVerKey?: string; - isPlus?: boolean; + scripts?: { [key: string]: string }[]; + favicon?: string; + sso?: { icon?: string; title?: string; @@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = { exportDatasetLimitMinutes?: number; websiteSyncLimitMinuted?: number; }; - scripts?: { [key: 
string]: string }[]; - favicon?: string; - customApiDomain?: string; - customSharePageDomain?: string; uploadFileMaxAmount?: number; uploadFileMaxSize?: number; + + // Compute by systemEnv.customPdfParse + showCustomPdfParse?: boolean; + customPdfParsePrice?: number; + lafEnv?: string; navbarItems?: NavbarItemType[]; externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[]; @@ -107,9 +116,18 @@ export type SystemEnvType = { openapiPrefix?: string; vectorMaxProcess: number; qaMaxProcess: number; + vlmMaxProcess: number; pgHNSWEfSearch: number; tokenWorkers: number; // token count max worker oneapiUrl?: string; chatApiKey?: string; + + customPdfParse?: { + url?: string; + key?: string; + + doc2xKey?: string; + price?: number; // n points/1 page + }; }; diff --git a/packages/global/common/system/utils.ts b/packages/global/common/system/utils.ts index 8f79cb2f5..e58761f48 100644 --- a/packages/global/common/system/utils.ts +++ b/packages/global/common/system/utils.ts @@ -16,3 +16,24 @@ export const retryFn = async (fn: () => Promise, retryTimes = 3): Promise< return Promise.reject(error); } }; + +export const batchRun = async (arr: T[], fn: (arr: T) => any, batchSize = 10) => { + const batchArr = new Array(batchSize).fill(null); + const result: any[] = []; + + const batchFn = async () => { + const data = arr.shift(); + if (data) { + result.push(await fn(data)); + return batchFn(); + } + }; + + await Promise.all( + batchArr.map(async () => { + await batchFn(); + }) + ); + + return result; +}; diff --git a/packages/global/core/ai/model.ts b/packages/global/core/ai/model.ts index 8fd818307..314287629 100644 --- a/packages/global/core/ai/model.ts +++ b/packages/global/core/ai/model.ts @@ -22,7 +22,7 @@ export const defaultQAModels: LLMModelItemType[] = [ maxTemperature: 1.2, charsPointsPrice: 0, censor: false, - vision: false, + vision: true, datasetProcess: true, toolChoice: true, functionCall: false, @@ -59,10 +59,17 @@ export const defaultSTTModels: 
STTModelType[] = [ export const getModelFromList = ( modelList: { provider: ModelProviderIdType; name: string; model: string }[], model: string -) => { +): + | { + avatar: string; + provider: ModelProviderIdType; + name: string; + model: string; + } + | undefined => { const modelData = modelList.find((item) => item.model === model) ?? modelList[0]; if (!modelData) { - throw new Error('No Key model is configured'); + return; } const provider = getModelProvider(modelData.provider); return { diff --git a/packages/global/core/app/type.d.ts b/packages/global/core/app/type.d.ts index 52b6964fb..1a0a53b85 100644 --- a/packages/global/core/app/type.d.ts +++ b/packages/global/core/app/type.d.ts @@ -188,6 +188,7 @@ export type AppAutoExecuteConfigType = { // File export type AppFileSelectConfigType = { canSelectFile: boolean; + customPdfParse?: boolean; canSelectImg: boolean; maxFiles: number; }; diff --git a/packages/global/core/chat/adapt.ts b/packages/global/core/chat/adapt.ts index 4c4d65101..4302ca757 100644 --- a/packages/global/core/chat/adapt.ts +++ b/packages/global/core/chat/adapt.ts @@ -1,8 +1,11 @@ import type { + AIChatItemValueItemType, ChatItemType, ChatItemValueItemType, RuntimeUserPromptType, - UserChatItemType + SystemChatItemValueItemType, + UserChatItemType, + UserChatItemValueItemType } from '../../core/chat/type.d'; import { ChatFileTypeEnum, ChatItemValueTypeEnum, ChatRoleEnum } from '../../core/chat/constants'; import type { @@ -174,137 +177,24 @@ export const GPTMessages2Chats = ( ): ChatItemType[] => { const chatMessages = messages .map((item) => { - const value: ChatItemType['value'] = []; const obj = GPT2Chat[item.role]; - if ( - obj === ChatRoleEnum.System && - item.role === ChatCompletionRequestMessageRoleEnum.System - ) { - if (Array.isArray(item.content)) { - item.content.forEach((item) => [ - value.push({ - type: ChatItemValueTypeEnum.text, - text: { - content: item.text - } - }) - ]); - } else { - value.push({ - type: 
ChatItemValueTypeEnum.text, - text: { - content: item.content - } - }); - } - } else if ( - obj === ChatRoleEnum.Human && - item.role === ChatCompletionRequestMessageRoleEnum.User - ) { - if (typeof item.content === 'string') { - value.push({ - type: ChatItemValueTypeEnum.text, - text: { - content: item.content - } - }); - } else if (Array.isArray(item.content)) { - item.content.forEach((item) => { - if (item.type === 'text') { + const value = (() => { + if ( + obj === ChatRoleEnum.System && + item.role === ChatCompletionRequestMessageRoleEnum.System + ) { + const value: SystemChatItemValueItemType[] = []; + + if (Array.isArray(item.content)) { + item.content.forEach((item) => [ value.push({ type: ChatItemValueTypeEnum.text, text: { content: item.text } - }); - } else if (item.type === 'image_url') { - value.push({ - //@ts-ignore - type: ChatItemValueTypeEnum.file, - file: { - type: ChatFileTypeEnum.image, - name: '', - url: item.image_url.url - } - }); - } else if (item.type === 'file_url') { - value.push({ - // @ts-ignore - type: ChatItemValueTypeEnum.file, - file: { - type: ChatFileTypeEnum.file, - name: item.name, - url: item.url - } - }); - } - }); - } - } else if ( - obj === ChatRoleEnum.AI && - item.role === ChatCompletionRequestMessageRoleEnum.Assistant - ) { - if (item.tool_calls && reserveTool) { - // save tool calls - const toolCalls = item.tool_calls as ChatCompletionMessageToolCall[]; - value.push({ - //@ts-ignore - type: ChatItemValueTypeEnum.tool, - tools: toolCalls.map((tool) => { - let toolResponse = - messages.find( - (msg) => - msg.role === ChatCompletionRequestMessageRoleEnum.Tool && - msg.tool_call_id === tool.id - )?.content || ''; - toolResponse = - typeof toolResponse === 'string' ? 
toolResponse : JSON.stringify(toolResponse); - - return { - id: tool.id, - toolName: tool.toolName || '', - toolAvatar: tool.toolAvatar || '', - functionName: tool.function.name, - params: tool.function.arguments, - response: toolResponse as string - }; - }) - }); - } else if (item.function_call && reserveTool) { - const functionCall = item.function_call as ChatCompletionMessageFunctionCall; - const functionResponse = messages.find( - (msg) => - msg.role === ChatCompletionRequestMessageRoleEnum.Function && - msg.name === item.function_call?.name - ) as ChatCompletionFunctionMessageParam; - - if (functionResponse) { - value.push({ - //@ts-ignore - type: ChatItemValueTypeEnum.tool, - tools: [ - { - id: functionCall.id || '', - toolName: functionCall.toolName || '', - toolAvatar: functionCall.toolAvatar || '', - functionName: functionCall.name, - params: functionCall.arguments, - response: functionResponse.content || '' - } - ] - }); - } - } else if (item.interactive) { - value.push({ - //@ts-ignore - type: ChatItemValueTypeEnum.interactive, - interactive: item.interactive - }); - } else if (typeof item.content === 'string') { - const lastValue = value[value.length - 1]; - if (lastValue && lastValue.type === ChatItemValueTypeEnum.text && lastValue.text) { - lastValue.text.content += item.content; + }) + ]); } else { value.push({ type: ChatItemValueTypeEnum.text, @@ -313,8 +203,145 @@ export const GPTMessages2Chats = ( } }); } + return value; + } else if ( + obj === ChatRoleEnum.Human && + item.role === ChatCompletionRequestMessageRoleEnum.User + ) { + const value: UserChatItemValueItemType[] = []; + + if (typeof item.content === 'string') { + value.push({ + type: ChatItemValueTypeEnum.text, + text: { + content: item.content + } + }); + } else if (Array.isArray(item.content)) { + item.content.forEach((item) => { + if (item.type === 'text') { + value.push({ + type: ChatItemValueTypeEnum.text, + text: { + content: item.text + } + }); + } else if (item.type === 
'image_url') { + value.push({ + //@ts-ignore + type: ChatItemValueTypeEnum.file, + file: { + type: ChatFileTypeEnum.image, + name: '', + url: item.image_url.url + } + }); + } else if (item.type === 'file_url') { + value.push({ + // @ts-ignore + type: ChatItemValueTypeEnum.file, + file: { + type: ChatFileTypeEnum.file, + name: item.name, + url: item.url + } + }); + } + }); + } + return value; + } else if ( + obj === ChatRoleEnum.AI && + item.role === ChatCompletionRequestMessageRoleEnum.Assistant + ) { + const value: AIChatItemValueItemType[] = []; + + if (typeof item.reasoning_text === 'string') { + value.push({ + type: ChatItemValueTypeEnum.reasoning, + reasoning: { + content: item.reasoning_text + } + }); + } + if (item.tool_calls && reserveTool) { + // save tool calls + const toolCalls = item.tool_calls as ChatCompletionMessageToolCall[]; + value.push({ + //@ts-ignore + type: ChatItemValueTypeEnum.tool, + tools: toolCalls.map((tool) => { + let toolResponse = + messages.find( + (msg) => + msg.role === ChatCompletionRequestMessageRoleEnum.Tool && + msg.tool_call_id === tool.id + )?.content || ''; + toolResponse = + typeof toolResponse === 'string' ? 
toolResponse : JSON.stringify(toolResponse); + + return { + id: tool.id, + toolName: tool.toolName || '', + toolAvatar: tool.toolAvatar || '', + functionName: tool.function.name, + params: tool.function.arguments, + response: toolResponse as string + }; + }) + }); + } + if (item.function_call && reserveTool) { + const functionCall = item.function_call as ChatCompletionMessageFunctionCall; + const functionResponse = messages.find( + (msg) => + msg.role === ChatCompletionRequestMessageRoleEnum.Function && + msg.name === item.function_call?.name + ) as ChatCompletionFunctionMessageParam; + + if (functionResponse) { + value.push({ + //@ts-ignore + type: ChatItemValueTypeEnum.tool, + tools: [ + { + id: functionCall.id || '', + toolName: functionCall.toolName || '', + toolAvatar: functionCall.toolAvatar || '', + functionName: functionCall.name, + params: functionCall.arguments, + response: functionResponse.content || '' + } + ] + }); + } + } + if (item.interactive) { + value.push({ + //@ts-ignore + type: ChatItemValueTypeEnum.interactive, + interactive: item.interactive + }); + } + if (typeof item.content === 'string') { + const lastValue = value[value.length - 1]; + if (lastValue && lastValue.type === ChatItemValueTypeEnum.text && lastValue.text) { + lastValue.text.content += item.content; + } else { + value.push({ + type: ChatItemValueTypeEnum.text, + text: { + content: item.content + } + }); + } + } + + return value; } - } + + return []; + })(); return { dataId: item.dataId, diff --git a/packages/global/core/chat/type.d.ts b/packages/global/core/chat/type.d.ts index 4e010c68f..8837075e8 100644 --- a/packages/global/core/chat/type.d.ts +++ b/packages/global/core/chat/type.d.ts @@ -77,6 +77,7 @@ export type AIChatItemValueItemType = { | ChatItemValueTypeEnum.reasoning | ChatItemValueTypeEnum.tool | ChatItemValueTypeEnum.interactive; + text?: { content: string; }; diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 
5cf6c860f..99b4aaa3a 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -1,5 +1,5 @@ import { DatasetDataIndexItemType, DatasetSchemaType } from './type'; -import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants'; +import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants'; import type { LLMModelItemType } from '../ai/model.d'; import { ParentIdType } from 'common/parentFolder/type'; @@ -10,9 +10,11 @@ export type DatasetUpdateBody = { name?: string; avatar?: string; intro?: string; - agentModel?: LLMModelItemType; status?: DatasetSchemaType['status']; + agentModel?: string; + vlmModel?: string; + websiteConfig?: DatasetSchemaType['websiteConfig']; externalReadUrl?: DatasetSchemaType['externalReadUrl']; defaultPermission?: DatasetSchemaType['defaultPermission']; @@ -27,7 +29,10 @@ export type DatasetUpdateBody = { /* ================= collection ===================== */ export type DatasetCollectionChunkMetadataType = { parentId?: string; - trainingType?: TrainingModeEnum; + customPdfParse?: boolean; + trainingType?: DatasetCollectionDataProcessModeEnum; + imageIndex?: boolean; + autoIndexes?: boolean; chunkSize?: number; chunkSplitter?: string; qaPrompt?: string; @@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = { export type PushDatasetDataProps = { collectionId: string; data: PushDatasetDataChunkProps[]; - trainingMode: TrainingModeEnum; + trainingType?: DatasetCollectionDataProcessModeEnum; + autoIndexes?: boolean; + imageIndex?: boolean; prompt?: string; + billId?: string; + + // Abandon + trainingMode?: DatasetCollectionDataProcessModeEnum; }; export type PushDatasetDataResponse = { insertLen: number; diff --git a/packages/global/core/dataset/collection/utils.ts b/packages/global/core/dataset/collection/utils.ts index f82a689c4..a803a989f 100644 --- a/packages/global/core/dataset/collection/utils.ts +++ b/packages/global/core/dataset/collection/utils.ts 
@@ -1,4 +1,4 @@ -import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants'; +import { DatasetCollectionTypeEnum } from '../constants'; import { DatasetCollectionSchemaType } from '../type'; export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => { @@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => { return type === DatasetCollectionTypeEnum.folder || type === DatasetCollectionTypeEnum.virtual; }; - -export const getTrainingTypeLabel = (type?: TrainingModeEnum) => { - if (!type) return ''; - if (!TrainingTypeMap[type]) return ''; - return TrainingTypeMap[type].label; -}; diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index a522645f9..7d0e3531b 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = { } }; +export enum DatasetCollectionDataProcessModeEnum { + chunk = 'chunk', + qa = 'qa', + auto = 'auto' // abandon +} +export const DatasetCollectionDataProcessModeMap = { + [DatasetCollectionDataProcessModeEnum.chunk]: { + label: i18nT('common:core.dataset.training.Chunk mode'), + tooltip: i18nT('common:core.dataset.import.Chunk Split Tip') + }, + [DatasetCollectionDataProcessModeEnum.qa]: { + label: i18nT('common:core.dataset.training.QA mode'), + tooltip: i18nT('common:core.dataset.import.QA Import Tip') + }, + [DatasetCollectionDataProcessModeEnum.auto]: { + label: i18nT('common:core.dataset.training.Auto mode'), + tooltip: i18nT('common:core.dataset.training.Auto mode Tip') + } +}; + /* ------------ data -------------- */ /* ------------ training -------------- */ @@ -124,28 +144,11 @@ export enum ImportDataSourceEnum { export enum TrainingModeEnum { chunk = 'chunk', + qa = 'qa', auto = 'auto', - qa = 'qa' + image = 
'image' } -export const TrainingTypeMap = { - [TrainingModeEnum.chunk]: { - label: i18nT('common:core.dataset.training.Chunk mode'), - tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'), - openSource: true - }, - [TrainingModeEnum.auto]: { - label: i18nT('common:core.dataset.training.Auto mode'), - tooltip: i18nT('common:core.dataset.training.Auto mode Tip'), - openSource: false - }, - [TrainingModeEnum.qa]: { - label: i18nT('common:core.dataset.training.QA mode'), - tooltip: i18nT('common:core.dataset.import.QA Import Tip'), - openSource: true - } -}; - /* ------------ search -------------- */ export enum DatasetSearchModeEnum { embedding = 'embedding', diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts index 732f4cf78..2382a94e6 100644 --- a/packages/global/core/dataset/controller.d.ts +++ b/packages/global/core/dataset/controller.d.ts @@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = { })[]; }; -export type PatchIndexesProps = { - type: 'create' | 'update' | 'delete' | 'unChange'; - index: Omit & { - dataId?: string; - }; -}; +export type PatchIndexesProps = + | { + type: 'create'; + index: Omit & { + dataId?: string; + }; + } + | { + type: 'update'; + index: DatasetDataIndexItemType; + } + | { + type: 'delete'; + index: DatasetDataIndexItemType; + } + | { + type: 'unChange'; + index: DatasetDataIndexItemType; + }; diff --git a/packages/global/core/dataset/data/constants.ts b/packages/global/core/dataset/data/constants.ts new file mode 100644 index 000000000..802b5f469 --- /dev/null +++ b/packages/global/core/dataset/data/constants.ts @@ -0,0 +1,42 @@ +import { i18nT } from '../../../../web/i18n/utils'; + +export enum DatasetDataIndexTypeEnum { + default = 'default', + custom = 'custom', + summary = 'summary', + question = 'question', + image = 'image' +} + +export const DatasetDataIndexMap: Record< + `${DatasetDataIndexTypeEnum}`, + { + label: any; + color: string; + } +> = { + 
[DatasetDataIndexTypeEnum.default]: { + label: i18nT('dataset:data_index_default'), + color: 'gray' + }, + [DatasetDataIndexTypeEnum.custom]: { + label: i18nT('dataset:data_index_custom'), + color: 'blue' + }, + [DatasetDataIndexTypeEnum.summary]: { + label: i18nT('dataset:data_index_summary'), + color: 'green' + }, + [DatasetDataIndexTypeEnum.question]: { + label: i18nT('dataset:data_index_question'), + color: 'red' + }, + [DatasetDataIndexTypeEnum.image]: { + label: i18nT('dataset:data_index_image'), + color: 'purple' + } +}; +export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom]; +export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) => { + return DatasetDataIndexMap[type] || defaultDatasetIndexData; +}; diff --git a/packages/global/core/dataset/training/type.d.ts b/packages/global/core/dataset/training/type.d.ts new file mode 100644 index 000000000..1bc15ea22 --- /dev/null +++ b/packages/global/core/dataset/training/type.d.ts @@ -0,0 +1,20 @@ +import { PushDatasetDataChunkProps } from '../api'; +import { TrainingModeEnum } from '../constants'; + +export type PushDataToTrainingQueueProps = { + teamId: string; + tmbId: string; + datasetId: string; + collectionId: string; + + mode?: TrainingModeEnum; + data: PushDatasetDataChunkProps[]; + prompt?: string; + + agentModel: string; + vectorModel: string; + vlmModel?: string; + + billId?: string; + session?: ClientSession; +}; diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index 49aabc625..74741a2a9 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod import { PermissionTypeEnum } from '../../support/permission/constant'; import { PushDatasetDataChunkProps } from './api'; import { + DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, DatasetStatusEnum, 
DatasetTypeEnum, @@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller'; import { Permission } from '../../support/permission/controller'; import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import { SourceMemberType } from 'support/user/type'; +import { DatasetDataIndexTypeEnum } from './data/constants'; export type DatasetSchemaType = { _id: string; @@ -23,11 +25,14 @@ export type DatasetSchemaType = { avatar: string; name: string; - vectorModel: string; - agentModel: string; intro: string; type: `${DatasetTypeEnum}`; status: `${DatasetStatusEnum}`; + + vectorModel: string; + agentModel: string; + vlmModel?: string; + websiteConfig?: { url: string; selector: string; @@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = { parentId?: string; name: string; type: DatasetCollectionTypeEnum; - createTime: Date; - updateTime: Date; - forbid?: boolean; - - trainingType: TrainingModeEnum; - chunkSize: number; - chunkSplitter?: string; - qaPrompt?: string; - ocrParse?: boolean; - tags?: string[]; + createTime: Date; + updateTime: Date; + + // Status + forbid?: boolean; + nextSyncTime?: Date; + + // Collection metadata fileId?: string; // local file id rawLink?: string; // link url externalFileId?: string; //external file id apiFileId?: string; // api file id externalFileUrl?: string; // external import url - nextSyncTime?: Date; - rawTextLength?: number; hashRawText?: string; metadata?: { @@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = { [key: string]: any; }; + + // Parse settings + customPdfParse?: boolean; + // Chunk settings + autoIndexes?: boolean; + imageIndex?: boolean; + trainingType: DatasetCollectionDataProcessModeEnum; + chunkSize: number; + chunkSplitter?: string; + qaPrompt?: string; }; export type DatasetCollectionTagsSchemaType = { @@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = { }; export type DatasetDataIndexItemType = { - defaultIndex: boolean; + type: 
`${DatasetDataIndexTypeEnum}`; dataId: string; // pg data id text: string; }; @@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = { chunkIndex: number; weight: number; indexes: Omit[]; + retryCount: number; }; export type CollectionWithDatasetType = DatasetCollectionSchemaType & { @@ -169,9 +181,10 @@ export type DatasetListItemType = { sourceMember?: SourceMemberType; }; -export type DatasetItemType = Omit & { +export type DatasetItemType = Omit & { vectorModel: EmbeddingModelItemType; agentModel: LLMModelItemType; + vlmModel?: LLMModelItemType; permission: DatasetPermission; }; diff --git a/packages/global/core/dataset/utils.ts b/packages/global/core/dataset/utils.ts index d8e6564ac..64c330c84 100644 --- a/packages/global/core/dataset/utils.ts +++ b/packages/global/core/dataset/utils.ts @@ -1,6 +1,7 @@ import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants'; import { getFileIcon } from '../../common/file/icon'; import { strIsLink } from '../../common/string/tools'; +import { DatasetDataIndexTypeEnum } from './data/constants'; export function getCollectionIcon( type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file, @@ -38,14 +39,23 @@ export function getSourceNameIcon({ } /* get dataset data default index */ -export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) { - const { q = '', a, dataId } = props || {}; - const qaStr = `${q}\n${a}`.trim(); - return { - defaultIndex: true, - text: a ? qaStr : q, - dataId - }; +export function getDefaultIndex(props?: { q?: string; a?: string }) { + const { q = '', a } = props || {}; + + return [ + { + text: q, + type: DatasetDataIndexTypeEnum.default + }, + ...(a + ? 
[ + { + text: a, + type: DatasetDataIndexTypeEnum.default + } + ] + : []) + ]; } export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => { diff --git a/packages/global/core/workflow/template/system/aiChat/index.ts b/packages/global/core/workflow/template/system/aiChat/index.ts index 8d6aa6cd0..db5df3d6d 100644 --- a/packages/global/core/workflow/template/system/aiChat/index.ts +++ b/packages/global/core/workflow/template/system/aiChat/index.ts @@ -55,7 +55,7 @@ export const AiChatModule: FlowNodeTemplateType = { showStatus: true, isTool: true, courseUrl: '/docs/guide/workbench/workflow/ai_chat/', - version: '4813', + version: '490', inputs: [ Input_Template_SettingAiModel, // --- settings modal diff --git a/packages/global/core/workflow/template/system/tools.ts b/packages/global/core/workflow/template/system/tools.ts index 8ef75d9f5..672deaffa 100644 --- a/packages/global/core/workflow/template/system/tools.ts +++ b/packages/global/core/workflow/template/system/tools.ts @@ -58,6 +58,13 @@ export const ToolModule: FlowNodeTemplateType = { valueType: WorkflowIOValueTypeEnum.boolean, value: true }, + { + key: NodeInputKeyEnum.aiChatReasoning, + renderTypeList: [FlowNodeInputTypeEnum.hidden], + label: '', + valueType: WorkflowIOValueTypeEnum.boolean, + value: true + }, { key: NodeInputKeyEnum.aiChatTopP, renderTypeList: [FlowNodeInputTypeEnum.hidden], diff --git a/packages/global/support/wallet/usage/constants.ts b/packages/global/support/wallet/usage/constants.ts index b20bc8a6d..e2848c703 100644 --- a/packages/global/support/wallet/usage/constants.ts +++ b/packages/global/support/wallet/usage/constants.ts @@ -10,7 +10,8 @@ export enum UsageSourceEnum { wecom = 'wecom', feishu = 'feishu', dingtalk = 'dingtalk', - official_account = 'official_account' + official_account = 'official_account', + pdfParse = 'pdfParse' } export const UsageSourceMap = { @@ -43,5 +44,8 @@ export const UsageSourceMap = { }, [UsageSourceEnum.dingtalk]: { label: 
i18nT('account_usage:dingtalk') + }, + [UsageSourceEnum.pdfParse]: { + label: i18nT('account_usage:pdf_parse') } }; diff --git a/packages/global/support/wallet/usage/type.d.ts b/packages/global/support/wallet/usage/type.d.ts index f34feb580..268279c85 100644 --- a/packages/global/support/wallet/usage/type.d.ts +++ b/packages/global/support/wallet/usage/type.d.ts @@ -7,6 +7,7 @@ export type UsageListItemCountType = { outputTokens?: number; charsLength?: number; duration?: number; + pages?: number; // deprecated tokens?: number; diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index b6480b002..f809cc772 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -52,7 +52,9 @@ export async function uploadFile({ const stats = await fsp.stat(path); if (!stats.isFile()) return Promise.reject(`${path} is not a file`); - const readStream = fs.createReadStream(path); + const readStream = fs.createReadStream(path, { + highWaterMark: 256 * 1024 + }); // Add default metadata metadata.teamId = teamId; @@ -62,9 +64,27 @@ export async function uploadFile({ // create a gridfs bucket const bucket = getGridBucket(bucketName); + const fileSize = stats.size; + const chunkSizeBytes = (() => { + // 计算理想块大小:文件大小 ÷ 目标块数(10) + const idealChunkSize = Math.ceil(fileSize / 10); + + // 确保块大小至少为512KB + const minChunkSize = 512 * 1024; // 512KB + + // 取理想块大小和最小块大小中的较大值 + let chunkSize = Math.max(idealChunkSize, minChunkSize); + + // 将块大小向上取整到最接近的64KB的倍数,使其更整齐 + chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024); + + return chunkSize; + })(); + const stream = bucket.openUploadStream(filename, { metadata, - contentType + contentType, + chunkSizeBytes }); // save to gridfs @@ -186,20 +206,25 @@ export async function getDownloadStream({ export const readFileContentFromMongo = async ({ teamId, + tmbId, bucketName, fileId, - isQAImport = false + isQAImport = false, 
+ customPdfParse = false }: { teamId: string; + tmbId: string; bucketName: `${BucketNameEnum}`; fileId: string; isQAImport?: boolean; + customPdfParse?: boolean; }): Promise<{ rawText: string; filename: string; }> => { + const bufferId = `${fileId}-${customPdfParse}`; // read buffer - const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, { + const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, { ...readFromSecondary }).lean(); if (fileBuffer) { @@ -227,9 +252,11 @@ export const readFileContentFromMongo = async ({ // Get raw text const { rawText } = await readRawContentByFileBuffer({ + customPdfParse, extension, isQAImport, teamId, + tmbId, buffer: fileBuffers, encoding, metadata: { @@ -240,7 +267,7 @@ export const readFileContentFromMongo = async ({ // < 14M if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) { MongoRawTextBuffer.create({ - sourceId: fileId, + sourceId: bufferId, rawText, metadata: { filename: file.filename diff --git a/packages/service/common/file/gridfs/utils.ts b/packages/service/common/file/gridfs/utils.ts index 1e0725e75..9e376f28f 100644 --- a/packages/service/common/file/gridfs/utils.ts +++ b/packages/service/common/file/gridfs/utils.ts @@ -3,15 +3,13 @@ import { PassThrough } from 'stream'; export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { return new Promise((resolve, reject) => { - const chunks: Buffer[] = []; - let totalLength = 0; + const chunks: Uint8Array[] = []; stream.on('data', (chunk) => { chunks.push(chunk); - totalLength += chunk.length; }); stream.on('end', () => { - const resultBuffer = Buffer.concat(chunks, totalLength); // 一次性拼接 + const resultBuffer = Buffer.concat(chunks); // 一次性拼接 resolve(resultBuffer); }); stream.on('error', (err) => { @@ -21,25 +19,26 @@ export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { }; export const stream2Encoding = async (stream: NodeJS.ReadableStream) => { - const start = Date.now(); 
const copyStream = stream.pipe(new PassThrough()); /* get encoding */ const buffer = await (() => { return new Promise((resolve, reject) => { - let tmpBuffer: Buffer = Buffer.from([]); + const chunks: Uint8Array[] = []; + let totalLength = 0; stream.on('data', (chunk) => { - if (tmpBuffer.length < 200) { - tmpBuffer = Buffer.concat([tmpBuffer, chunk]); + if (totalLength < 200) { + chunks.push(chunk); + totalLength += chunk.length; - if (tmpBuffer.length >= 200) { - resolve(tmpBuffer); + if (totalLength >= 200) { + resolve(Buffer.concat(chunks)); } } }); stream.on('end', () => { - resolve(tmpBuffer); + resolve(Buffer.concat(chunks)); }); stream.on('error', (err) => { reject(err); diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts index c2772bcde..0bf898337 100644 --- a/packages/service/common/file/image/controller.ts +++ b/packages/service/common/file/image/controller.ts @@ -6,6 +6,7 @@ import { guessBase64ImageType } from '../utils'; import { readFromSecondary } from '../../mongo/utils'; import { addHours } from 'date-fns'; import { imageFileType } from '@fastgpt/global/common/file/constants'; +import { retryFn } from '@fastgpt/global/common/system/utils'; export const maxImgSize = 1024 * 1024 * 12; const base64MimeRegex = /data:image\/([^\)]+);base64/; @@ -40,13 +41,15 @@ export async function uploadMongoImg({ return Promise.reject(`Invalid image file type: ${mime}`); } - const { _id } = await MongoImage.create({ - teamId, - binary, - metadata: Object.assign({ mime }, metadata), - shareId, - expiredTime: forever ? undefined : addHours(new Date(), 1) - }); + const { _id } = await retryFn(() => + MongoImage.create({ + teamId, + binary, + metadata: Object.assign({ mime }, metadata), + shareId, + expiredTime: forever ? 
undefined : addHours(new Date(), 1) + }) + ); return `${process.env.NEXT_PUBLIC_BASE_URL || ''}${imageBaseUrl}${String(_id)}.${extension}`; } diff --git a/packages/service/common/file/image/utils.ts b/packages/service/common/file/image/utils.ts new file mode 100644 index 000000000..57820879d --- /dev/null +++ b/packages/service/common/file/image/utils.ts @@ -0,0 +1,34 @@ +import axios from 'axios'; +import { addLog } from '../../system/log'; +import { serverRequestBaseUrl } from '../../api/serverRequest'; +import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils'; +import { retryFn } from '@fastgpt/global/common/system/utils'; + +export const getImageBase64 = async (url: string) => { + addLog.debug(`Load image to base64: ${url}`); + + try { + const response = await retryFn(() => + axios.get(url, { + baseURL: serverRequestBaseUrl, + responseType: 'arraybuffer', + proxy: false + }) + ); + + const base64 = Buffer.from(response.data, 'binary').toString('base64'); + const imageType = + getFileContentTypeFromHeader(response.headers['content-type']) || + guessBase64ImageType(base64); + + return { + completeBase64: `data:${imageType};base64,${base64}`, + base64, + mime: imageType + }; + } catch (error) { + addLog.debug(`Load image to base64 failed: ${url}`); + console.log(error); + return Promise.reject(error); + } +}; diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index ba6863436..7575b3675 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -1,18 +1,24 @@ import { uploadMongoImg } from '../image/controller'; import FormData from 'form-data'; - import { WorkerNameEnum, runWorker } from '../../../worker/utils'; import fs from 'fs'; -import type { ReadFileResponse } from '../../../worker/readFile/type'; +import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type'; import axios from 'axios'; import { addLog } from 
'../../system/log'; -import { batchRun } from '@fastgpt/global/common/fn/utils'; -import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown'; +import { batchRun } from '@fastgpt/global/common/system/utils'; +import { htmlTable2Md, matchMdImg } from '@fastgpt/global/common/string/markdown'; +import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; +import { getErrText } from '@fastgpt/global/common/error/utils'; +import { delay } from '@fastgpt/global/common/system/utils'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import { getImageBase64 } from '../image/utils'; export type readRawTextByLocalFileParams = { teamId: string; + tmbId: string; path: string; encoding: string; + customPdfParse?: boolean; metadata?: Record; }; export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => { @@ -22,46 +28,51 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam const buffer = await fs.promises.readFile(path); - const { rawText } = await readRawContentByFileBuffer({ + return readRawContentByFileBuffer({ extension, isQAImport: false, + customPdfParse: params.customPdfParse, teamId: params.teamId, + tmbId: params.tmbId, encoding: params.encoding, buffer, metadata: params.metadata }); - - return { - rawText - }; }; export const readRawContentByFileBuffer = async ({ - extension, - isQAImport, teamId, + tmbId, + + extension, buffer, encoding, - metadata + metadata, + customPdfParse = false, + isQAImport = false }: { - isQAImport?: boolean; - extension: string; teamId: string; + tmbId: string; + + extension: string; buffer: Buffer; encoding: string; metadata?: Record; -}) => { - // Custom read file service - const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL; - const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || ''; - const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false'; - const readFileFromCustomService = async (): Promise 
=> { - if ( - !customReadfileUrl || - !customReadFileExtension || - !customReadFileExtension.includes(extension) - ) - return; + + customPdfParse?: boolean; + isQAImport: boolean; +}): Promise => { + const systemParse = () => + runWorker(WorkerNameEnum.readFile, { + extension, + encoding, + buffer, + teamId + }); + const parsePdfFromCustomService = async (): Promise => { + const url = global.systemEnv.customPdfParse?.url; + const token = global.systemEnv.customPdfParse?.key; + if (!url) return systemParse(); const start = Date.now(); addLog.info('Parsing files from an external service'); @@ -70,27 +81,32 @@ export const readRawContentByFileBuffer = async ({ data.append('file', buffer, { filename: `file.${extension}` }); - data.append('extension', extension); - data.append('ocr', ocrParse); const { data: response } = await axios.post<{ - success: boolean; - message: string; - data: { - page: number; - markdown: string; - duration: number; - }; - }>(customReadfileUrl, data, { + pages: number; + markdown: string; + error?: Object | string; + }>(url, data, { timeout: 600000, headers: { - ...data.getHeaders() + ...data.getHeaders(), + Authorization: token ? 
`Bearer ${token}` : undefined } }); + if (response.error) { + return Promise.reject(response.error); + } + addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`); - const rawText = response.data.markdown; - const { text, imageList } = matchMdImgTextAndUpload(rawText); + const rawText = response.markdown; + const { text, imageList } = matchMdImg(rawText); + + createPdfParseUsage({ + teamId, + tmbId, + pages: response.pages + }); return { rawText: text, @@ -98,15 +114,198 @@ export const readRawContentByFileBuffer = async ({ imageList }; }; + const parsePdfFromDoc2x = async (): Promise => { + const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey; + if (!doc2xKey) return systemParse(); - let { rawText, formatText, imageList } = - (await readFileFromCustomService()) || - (await runWorker(WorkerNameEnum.readFile, { - extension, - encoding, - buffer, - teamId - })); + const parseTextImage = async (text: string) => { + // Extract image links and convert to base64 + const imageList: { id: string; url: string }[] = []; + let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => { + const id = `IMAGE_${getNanoid()}_IMAGE`; + imageList.push({ + id, + url + }); + return `![](${id})`; + }); + + // Get base64 from image url + let resultImageList: ImageType[] = []; + await batchRun( + imageList, + async (item) => { + try { + const { base64, mime } = await getImageBase64(item.url); + resultImageList.push({ + uuid: item.id, + mime, + base64 + }); + } catch (error) { + processedText = processedText.replace(item.id, item.url); + addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`); + } + }, + 5 + ); + + return { + text: processedText, + imageList: resultImageList + }; + }; + + let startTime = Date.now(); + + // 1. 
Get pre-upload URL first + const { data: preupload_data } = await axios + .post<{ code: string; data: { uid: string; url: string } }>( + 'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload', + null, + { + headers: { + Authorization: `Bearer ${doc2xKey}` + } + } + ) + .catch((error) => { + return Promise.reject( + `[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}` + ); + }); + if (preupload_data?.code !== 'success') { + return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`); + } + + const upload_url = preupload_data.data.url; + const uid = preupload_data.data.uid; + + // 2. Upload file to pre-signed URL with binary stream + const blob = new Blob([buffer], { type: 'application/pdf' }); + const response = await axios + .put(upload_url, blob, { + headers: { + 'Content-Type': 'application/pdf' + } + }) + .catch((error) => { + return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`); + }); + if (response.status !== 200) { + return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`); + } + + await delay(5000); + addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`); + // 3. 
Get the result by uid + const checkResult = async (retry = 30) => { + if (retry <= 0) { + return Promise.reject( + `[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout` + ); + } + + try { + const { data: result_data } = await axios + .get<{ + code: string; + data: { + progress: number; + status: 'processing' | 'failed' | 'success'; + result: { + pages: { + md: string; + }[]; + }; + }; + }>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, { + headers: { + Authorization: `Bearer ${doc2xKey}` + } + }) + .catch((error) => { + return Promise.reject( + `[Parse Status Error] Failed to get parse status: ${getErrText(error)}` + ); + }); + + // Error + if (!['ok', 'success'].includes(result_data.code)) { + return Promise.reject( + `Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}` + ); + } + + // Process + if (['ready', 'processing'].includes(result_data.data.status)) { + addLog.debug(`Waiting for the result, uid: ${uid}`); + await delay(5000); + return checkResult(retry - 1); + } + + // Finifsh + if (result_data.data.status === 'success') { + const result = result_data.data.result.pages + .map((page) => page.md) + .join('') + // Do some post-processing + .replace(/\\[\(\)]/g, '$') + .replace(/\\[\[\]]/g, '$$') + .replace(/]*)?(?:\s*\/>|>)/g, '![img]($1)') + .replace(//g, '') + .replace(//g, '') + .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$') + .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}'); + + const { text, imageList } = await parseTextImage(htmlTable2Md(result)); + + return { + pages: result_data.data.result.pages.length, + text, + imageList + }; + } + return checkResult(retry - 1); + } catch (error) { + if (retry > 1) { + await delay(100); + return checkResult(retry - 1); + } + return Promise.reject(error); + } + }; + + const { pages, text, imageList } = await checkResult(); + + createPdfParseUsage({ + teamId, + tmbId, + pages + }); + + addLog.info(`Doc2x parse 
success, time: ${Date.now() - startTime}ms`); + return { + rawText: text, + formatText: text, + imageList + }; + }; + // Custom read file service + const pdfParseFn = async (): Promise => { + if (!customPdfParse) return systemParse(); + if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService(); + if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x(); + + return systemParse(); + }; + + let { rawText, formatText, imageList } = await (async () => { + if (extension === 'pdf') { + return await pdfParseFn(); + } + return await systemParse(); + })(); // markdown data format if (imageList) { @@ -116,14 +315,14 @@ export const readRawContentByFileBuffer = async ({ return await uploadMongoImg({ base64Img: `data:${item.mime};base64,${item.base64}`, teamId, - // expiredTime: addHours(new Date(), 1), metadata: { ...metadata, mime: item.mime } }); } catch (error) { - return ''; + addLog.warn('Upload file image error', { error }); + return 'Upload load image error'; } })(); rawText = rawText.replace(item.uuid, src); @@ -142,5 +341,5 @@ export const readRawContentByFileBuffer = async ({ } } - return { rawText }; + return { rawText, formatText, imageList }; }; diff --git a/packages/service/common/system/tools.ts b/packages/service/common/system/tools.ts index a52e2a79b..321c2fbe0 100644 --- a/packages/service/common/system/tools.ts +++ b/packages/service/common/system/tools.ts @@ -10,6 +10,11 @@ export const SERVICE_LOCAL_HOST = export const initFastGPTConfig = (config?: FastGPTConfigFileType) => { if (!config) return; + // Special config computed + config.feConfigs.showCustomPdfParse = + !!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey; + config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0; + global.feConfigs = config.feConfigs; global.systemEnv = config.systemEnv; global.subPlans = config.subPlans; diff --git a/packages/service/common/system/utils.ts 
b/packages/service/common/system/utils.ts index 3151a4394..13837aab4 100644 --- a/packages/service/common/system/utils.ts +++ b/packages/service/common/system/utils.ts @@ -30,10 +30,10 @@ export const isInternalAddress = (url: string): boolean => { return true; } - // For non-metadata URLs, check if it's a domain name + // For IP addresses, check if they are internal const ipv4Pattern = /^(\d{1,3}\.){3}\d{1,3}$/; if (!ipv4Pattern.test(hostname)) { - return true; + return false; // Not an IP address, so it's a domain name - consider it external by default } // ... existing IP validation code ... diff --git a/packages/service/common/vectorStore/pg/class.ts b/packages/service/common/vectorStore/pg/class.ts index ba08adf43..6b7f42bd3 100644 --- a/packages/service/common/vectorStore/pg/class.ts +++ b/packages/service/common/vectorStore/pg/class.ts @@ -164,34 +164,22 @@ export class PgVectorCtrl { } try { - // const explan: any = await PgClient.query( - // `BEGIN; - // SET LOCAL hnsw.ef_search = ${global.systemEnv?.pgHNSWEfSearch || 100}; - // EXPLAIN ANALYZE select id, collection_id, vector <#> '[${vector}]' AS score - // from ${DatasetVectorTableName} - // where team_id='${teamId}' - // AND dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')}) - // ${forbidCollectionSql} - // order by score limit ${limit}; - // COMMIT;` - // ); - // console.log(explan[2].rows); - const results: any = await PgClient.query( - ` - BEGIN; + `BEGIN; SET LOCAL hnsw.ef_search = ${global.systemEnv?.pgHNSWEfSearch || 100}; - select id, collection_id, vector <#> '[${vector}]' AS score - from ${DatasetVectorTableName} - where team_id='${teamId}' - AND dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')}) - ${filterCollectionIdSql} - ${forbidCollectionSql} - order by score limit ${limit}; + SET LOCAL hnsw.iterative_scan = relaxed_order; + WITH relaxed_results AS MATERIALIZED ( + select id, collection_id, vector <#> '[${vector}]' AS score + from 
${DatasetVectorTableName} + where team_id='${teamId}' + AND dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')}) + ${filterCollectionIdSql} + ${forbidCollectionSql} + order by score limit ${limit} + ) SELECT id, collection_id, score FROM relaxed_results ORDER BY score; COMMIT;` ); - - const rows = results?.[2]?.rows as PgSearchRawType[]; + const rows = results?.[3]?.rows as PgSearchRawType[]; return { results: rows.map((item) => ({ diff --git a/packages/service/core/ai/audio/speech.ts b/packages/service/core/ai/audio/speech.ts index 3d82f68c6..0bacaae4e 100644 --- a/packages/service/core/ai/audio/speech.ts +++ b/packages/service/core/ai/audio/speech.ts @@ -43,13 +43,13 @@ export async function text2Speech({ const readableStream = response.body as unknown as NodeJS.ReadableStream; readableStream.pipe(res); - let bufferStore = Buffer.from([]); + const chunks: Uint8Array[] = []; readableStream.on('data', (chunk) => { - bufferStore = Buffer.concat([bufferStore, chunk]); + chunks.push(chunk); }); readableStream.on('end', () => { - onSuccess({ model, buffer: bufferStore }); + onSuccess({ model, buffer: Buffer.concat(chunks) }); }); readableStream.on('error', (e) => { onError(e); diff --git a/packages/service/core/ai/config/provider/DeepSeek.json b/packages/service/core/ai/config/provider/DeepSeek.json index df9369e38..91b748e85 100644 --- a/packages/service/core/ai/config/provider/DeepSeek.json +++ b/packages/service/core/ai/config/provider/DeepSeek.json @@ -46,8 +46,8 @@ "defaultConfig": {}, "fieldMap": {}, "type": "llm", - "showTopP": true, - "showStopSign": true + "showTopP": false, + "showStopSign": false } ] } diff --git a/packages/service/core/ai/config/provider/Moonshot.json b/packages/service/core/ai/config/provider/Moonshot.json index 796c529b8..f33b09ebe 100644 --- a/packages/service/core/ai/config/provider/Moonshot.json +++ b/packages/service/core/ai/config/provider/Moonshot.json @@ -75,6 +75,81 @@ "showTopP": true, "showStopSign": true, 
"responseFormatList": ["text", "json_object"] + }, + { + "model": "moonshot-v1-8k-vision-preview", + "name": "moonshot-v1-8k-vision-preview", + "maxContext": 8000, + "maxResponse": 4000, + "quoteMaxToken": 6000, + "maxTemperature": 1, + "vision": true, + "toolChoice": true, + "functionCall": false, + "defaultSystemChatPrompt": "", + "datasetProcess": true, + "usedInClassify": true, + "customCQPrompt": "", + "usedInExtractFields": true, + "usedInQueryExtension": true, + "customExtractPrompt": "", + "usedInToolCall": true, + "defaultConfig": {}, + "fieldMap": {}, + "type": "llm", + "showTopP": true, + "showStopSign": true, + "responseFormatList": ["text", "json_object"] + }, + { + "model": "moonshot-v1-32k-vision-preview", + "name": "moonshot-v1-32k-vision-preview", + "maxContext": 32000, + "maxResponse": 4000, + "quoteMaxToken": 32000, + "maxTemperature": 1, + "vision": true, + "toolChoice": true, + "functionCall": false, + "defaultSystemChatPrompt": "", + "datasetProcess": true, + "usedInClassify": true, + "customCQPrompt": "", + "usedInExtractFields": true, + "usedInQueryExtension": true, + "customExtractPrompt": "", + "usedInToolCall": true, + "defaultConfig": {}, + "fieldMap": {}, + "type": "llm", + "showTopP": true, + "showStopSign": true, + "responseFormatList": ["text", "json_object"] + }, + { + "model": "moonshot-v1-128k-vision-preview", + "name": "moonshot-v1-128k-vision-preview", + "maxContext": 128000, + "maxResponse": 4000, + "quoteMaxToken": 60000, + "maxTemperature": 1, + "vision": true, + "toolChoice": true, + "functionCall": false, + "defaultSystemChatPrompt": "", + "datasetProcess": true, + "usedInClassify": true, + "customCQPrompt": "", + "usedInExtractFields": true, + "usedInQueryExtension": true, + "customExtractPrompt": "", + "usedInToolCall": true, + "defaultConfig": {}, + "fieldMap": {}, + "type": "llm", + "showTopP": true, + "showStopSign": true, + "responseFormatList": ["text", "json_object"] } ] } diff --git 
a/packages/service/core/ai/config/utils.ts b/packages/service/core/ai/config/utils.ts index c64ea2c57..86c3786df 100644 --- a/packages/service/core/ai/config/utils.ts +++ b/packages/service/core/ai/config/utils.ts @@ -163,6 +163,13 @@ export const loadSystemModels = async (init = false) => { global.systemDefaultModel.rerank = Array.from(global.reRankModelMap.values())[0]; } + // Sort model list + global.systemActiveModelList.sort((a, b) => { + const providerA = getModelProvider(a.provider); + const providerB = getModelProvider(b.provider); + return providerA.order - providerB.order; + }); + console.log('Load models success', JSON.stringify(global.systemActiveModelList, null, 2)); } catch (error) { console.error('Load models error', error); diff --git a/packages/service/core/ai/model.ts b/packages/service/core/ai/model.ts index 185881a23..3b0f0aef8 100644 --- a/packages/service/core/ai/model.ts +++ b/packages/service/core/ai/model.ts @@ -13,6 +13,11 @@ export const getDatasetModel = (model?: string) => { ?.find((item) => item.model === model || item.name === model) ?? 
getDefaultLLMModel() ); }; +export const getVlmModel = (model?: string) => { + return Array.from(global.llmModelMap.values()) + ?.filter((item) => item.vision) + ?.find((item) => item.model === model || item.name === model); +}; export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!; export const getEmbeddingModel = (model?: string) => { diff --git a/packages/service/core/app/templates/templateSchema.ts b/packages/service/core/app/templates/templateSchema.ts index 2485f535f..826437412 100644 --- a/packages/service/core/app/templates/templateSchema.ts +++ b/packages/service/core/app/templates/templateSchema.ts @@ -9,41 +9,23 @@ const AppTemplateSchema = new Schema({ type: String, required: true }, - name: { - type: String - }, - intro: { - type: String - }, - avatar: { - type: String - }, - author: { - type: String - }, + name: String, + intro: String, + avatar: String, + author: String, tags: { type: [String], default: undefined }, - type: { - type: String - }, - isActive: { - type: Boolean - }, - userGuide: { - type: Object - }, - isQuickTemplate: { - type: Boolean - }, + type: String, + isActive: Boolean, + userGuide: Object, + isQuickTemplate: Boolean, order: { type: Number, default: -1 }, - workflow: { - type: Object - } + workflow: Object }); AppTemplateSchema.index({ templateId: 1 }); diff --git a/packages/service/core/chat/utils.ts b/packages/service/core/chat/utils.ts index 52edb2cbb..b5a70ace2 100644 --- a/packages/service/core/chat/utils.ts +++ b/packages/service/core/chat/utils.ts @@ -9,10 +9,9 @@ import type { } from '@fastgpt/global/core/ai/type.d'; import axios from 'axios'; import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants'; -import { getFileContentTypeFromHeader, guessBase64ImageType } from '../../common/file/utils'; -import { serverRequestBaseUrl } from '../../common/api/serverRequest'; import { i18nT } from '../../../web/i18n/utils'; import { addLog } from 
'../../common/system/log'; +import { getImageBase64 } from '../../common/file/image/utils'; export const filterGPTMessageByMaxContext = async ({ messages = [], @@ -166,25 +165,13 @@ export const loadRequestMessages = async ({ try { // If imgUrl is a local path, load image from local, and set url to base64 if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') { - addLog.debug('Load image from local server', { - baseUrl: serverRequestBaseUrl, - requestUrl: imgUrl - }); - const response = await axios.get(imgUrl, { - baseURL: serverRequestBaseUrl, - responseType: 'arraybuffer', - proxy: false - }); - const base64 = Buffer.from(response.data, 'binary').toString('base64'); - const imageType = - getFileContentTypeFromHeader(response.headers['content-type']) || - guessBase64ImageType(base64); + const { completeBase64: base64 } = await getImageBase64(imgUrl); return { ...item, image_url: { ...item.image_url, - url: `data:${imageType};base64,${base64}` + url: base64 } }; } @@ -223,7 +210,8 @@ export const loadRequestMessages = async ({ await Promise.all( content.map(async (item) => { if (item.type === 'text') { - if (item.text) return parseStringWithImages(item.text); + // If it is array, not need to parse image + if (item.text) return item; return; } if (item.type === 'file_url') return; // LLM not support file_url diff --git a/packages/service/core/dataset/apiDataset/api.ts b/packages/service/core/dataset/apiDataset/api.ts index 162d685f5..7d2ecb4da 100644 --- a/packages/service/core/dataset/apiDataset/api.ts +++ b/packages/service/core/dataset/apiDataset/api.ts @@ -108,7 +108,15 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer } return formattedFiles; }; - const getFileContent = async ({ teamId, apiFileId }: { teamId: string; apiFileId: string }) => { + const getFileContent = async ({ + teamId, + tmbId, + apiFileId + }: { + teamId: string; + tmbId: string; + apiFileId: string; + }) => { const data = await request( 
`/v1/file/content`, { id: apiFileId }, @@ -123,6 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer } if (previewUrl) { const rawText = await readFileRawTextByUrl({ teamId, + tmbId, url: previewUrl, relatedId: apiFileId }); diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 159e62354..0dfcc6152 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -1,6 +1,6 @@ import { DatasetCollectionTypeEnum, - TrainingModeEnum + DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants'; import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; import { MongoDatasetCollection } from './schema'; @@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils'; import { mongoSessionRun } from '../../../common/mongo/sessionRun'; import { createTrainingUsage } from '../../../support/wallet/usage/controller'; import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'; -import { getLLMModel, getEmbeddingModel } from '../../ai/model'; +import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model'; import { pushDataListToTrainingQueue } from '../training/controller'; import { MongoImage } from '../../../common/file/image/schema'; import { hashStr } from '@fastgpt/global/common/string/tools'; import { addDays } from 'date-fns'; import { MongoDatasetDataText } from '../data/dataTextSchema'; -import { delay, retryFn } from '@fastgpt/global/common/system/utils'; +import { retryFn } from '@fastgpt/global/common/system/utils'; +import { getTrainingModeByCollection } from './utils'; export const createCollectionAndInsertData = async ({ dataset, @@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({ relatedId, createCollectionParams, isQAImport = false, + 
billId, session }: { dataset: DatasetSchemaType; @@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({ createCollectionParams: CreateOneCollectionParams; isQAImport?: boolean; + billId?: string; session?: ClientSession; }) => { + // Adapter 4.9.0 + if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) { + createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk; + createCollectionParams.autoIndexes = true; + } + const teamId = createCollectionParams.teamId; const tmbId = createCollectionParams.tmbId; // Chunk split params - const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk; - const chunkSize = createCollectionParams.chunkSize; + const trainingType = + createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; + const chunkSize = createCollectionParams.chunkSize || 512; const chunkSplitter = createCollectionParams.chunkSplitter; const qaPrompt = createCollectionParams.qaPrompt; const usageName = createCollectionParams.name; @@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({ const chunks = rawText2Chunks({ rawText, chunkLen: chunkSize, - overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0, + overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, customReg: chunkSplitter ? [chunkSplitter] : [], isQAImport }); @@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({ // 2. auth limit await checkDatasetLimit({ teamId, - insertLen: predictDataLimitLength(trainingType, chunks) + insertLen: predictDataLimitLength( + getTrainingModeByCollection({ + trainingType, + autoIndexes: createCollectionParams.autoIndexes, + imageIndex: createCollectionParams.imageIndex + }), + chunks + ) }); const fn = async (session: ClientSession) => { @@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({ }); // 4. 
create training bill - const { billId } = await createTrainingUsage({ - teamId, - tmbId, - appName: usageName, - billSource: UsageSourceEnum.training, - vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, - agentModel: getLLMModel(dataset.agentModel)?.name, - session - }); + const traingBillId = await (async () => { + if (billId) return billId; + const { billId: newBillId } = await createTrainingUsage({ + teamId, + tmbId, + appName: usageName, + billSource: UsageSourceEnum.training, + vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, + agentModel: getLLMModel(dataset.agentModel)?.name, + vllmModel: getVlmModel(dataset.vlmModel)?.name, + session + }); + return newBillId; + })(); // 5. insert to training queue const insertResults = await pushDataListToTrainingQueue({ @@ -107,9 +129,14 @@ export const createCollectionAndInsertData = async ({ collectionId, agentModel: dataset.agentModel, vectorModel: dataset.vectorModel, - trainingMode: trainingType, + vlmModel: dataset.vlmModel, + mode: getTrainingModeByCollection({ + trainingType, + autoIndexes: createCollectionParams.autoIndexes, + imageIndex: createCollectionParams.imageIndex + }), prompt: qaPrompt, - billId, + billId: traingBillId, data: chunks.map((item, index) => ({ ...item, chunkIndex: index @@ -161,10 +188,15 @@ export async function createOneCollection({ datasetId, type, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt, + createTime, + updateTime, + + hashRawText, + rawTextLength, + metadata = {}, + tags, + + nextSyncTime, fileId, rawLink, @@ -172,15 +204,18 @@ export async function createOneCollection({ externalFileUrl, apiFileId, - hashRawText, - rawTextLength, - metadata = {}, - session, - tags, + // Parse settings + customPdfParse, + imageIndex, - createTime, - updateTime, - nextSyncTime + // Chunk settings + trainingType = DatasetCollectionDataProcessModeEnum.chunk, + autoIndexes, + chunkSize = 512, + chunkSplitter, + qaPrompt, + + session }: 
CreateOneCollectionParams) { // Create collection tags const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session }); @@ -196,25 +231,31 @@ export async function createOneCollection({ name, type, - trainingType, - chunkSize, - chunkSplitter, - qaPrompt, + rawTextLength, + hashRawText, + tags: collectionTags, metadata, + createTime, + updateTime, + nextSyncTime, + ...(fileId ? { fileId } : {}), ...(rawLink ? { rawLink } : {}), ...(externalFileId ? { externalFileId } : {}), ...(externalFileUrl ? { externalFileUrl } : {}), ...(apiFileId ? { apiFileId } : {}), - rawTextLength, - hashRawText, - tags: collectionTags, + // Parse settings + customPdfParse, + imageIndex, - createTime, - updateTime, - nextSyncTime + // Chunk settings + trainingType, + autoIndexes, + chunkSize, + chunkSplitter, + qaPrompt } ], { session, ordered: true } diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index da13ed8ae..7e1686f95 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -1,7 +1,10 @@ import { connectionMongo, getMongoModel } from '../../../common/mongo'; const { Schema, model, models } = connectionMongo; import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d'; -import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants'; +import { + DatasetCollectionTypeMap, + DatasetCollectionDataProcessModeEnum +} from '@fastgpt/global/core/dataset/constants'; import { DatasetCollectionName } from '../schema'; import { TeamCollectionName, @@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({ ref: DatasetCollectionName, required: true }, + + // Basic info type: { type: String, enum: Object.keys(DatasetCollectionTypeMap), @@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({ type: String, required: true }, + tags: { + type: [String], + default: [] + 
}, + createTime: { type: Date, default: () => new Date() @@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({ type: Date, default: () => new Date() }, - forbid: { - type: Boolean, - default: false - }, - - // chunk filed - trainingType: { - type: String, - enum: Object.keys(TrainingTypeMap) - }, - chunkSize: { - type: Number, - required: true - }, - chunkSplitter: { - type: String - }, - qaPrompt: { - type: String - }, - ocrParse: Boolean, - - tags: { - type: [String], - default: [] - }, + // Metadata // local file collection fileId: { type: Schema.Types.ObjectId, @@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({ }, // web link collection rawLink: String, - // api collection + // Api collection apiFileId: String, - // external collection + // external collection(Abandoned) externalFileId: String, externalFileUrl: String, // external import url - // next sync time - nextSyncTime: Date, - - // metadata rawTextLength: Number, hashRawText: String, metadata: { type: Object, default: {} - } + }, + + forbid: Boolean, + // next sync time + nextSyncTime: Date, + + // Parse settings + customPdfParse: Boolean, + + // Chunk settings + imageIndex: Boolean, + autoIndexes: Boolean, + trainingType: { + type: String, + enum: Object.values(DatasetCollectionDataProcessModeEnum) + }, + chunkSize: { + type: Number, + required: true + }, + chunkSplitter: String, + qaPrompt: String }); DatasetCollectionSchema.virtual('dataset', { diff --git a/packages/service/core/dataset/collection/utils.ts b/packages/service/core/dataset/collection/utils.ts index 4f674a9dd..01051ca77 100644 --- a/packages/service/core/dataset/collection/utils.ts +++ b/packages/service/core/dataset/collection/utils.ts @@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema'; import { ClientSession } from '../../../common/mongo'; import { MongoDatasetCollectionTags } from '../tag/schema'; import { readFromSecondary } from '../../../common/mongo/utils'; -import { 
CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type'; import { + CollectionWithDatasetType, + DatasetCollectionSchemaType +} from '@fastgpt/global/core/dataset/type'; +import { + DatasetCollectionDataProcessModeEnum, DatasetCollectionSyncResultEnum, DatasetCollectionTypeEnum, DatasetSourceReadTypeEnum, - DatasetTypeEnum + DatasetTypeEnum, + TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset'; import { readDatasetSourceRawText } from '../read'; @@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { })(); const rawText = await readDatasetSourceRawText({ teamId: collection.teamId, + tmbId: collection.tmbId, ...sourceReadType }); @@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { return DatasetCollectionSyncResultEnum.success; }; + +/* + QA: 独立进程 + Chunk: Image Index -> Auto index -> chunk index +*/ +export const getTrainingModeByCollection = (collection: { + trainingType: DatasetCollectionSchemaType['trainingType']; + autoIndexes?: DatasetCollectionSchemaType['autoIndexes']; + imageIndex?: DatasetCollectionSchemaType['imageIndex']; +}) => { + if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) { + return TrainingModeEnum.qa; + } + if (collection.imageIndex && global.feConfigs?.isPlus) { + return TrainingModeEnum.image; + } + if (collection.autoIndexes && global.feConfigs?.isPlus) { + return TrainingModeEnum.auto; + } + return TrainingModeEnum.chunk; +}; diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 85dd8a7d2..bdba3b87c 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -7,6 +7,7 @@ import { } from '@fastgpt/global/support/user/team/constant'; import { DatasetCollectionName } from '../schema'; import { DatasetColCollectionName 
} from '../collection/schema'; +import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; export const DatasetDataCollectionName = 'dataset_datas'; @@ -42,9 +43,14 @@ const DatasetDataSchema = new Schema({ indexes: { type: [ { + // Abandon defaultIndex: { - type: Boolean, - default: false + type: Boolean + }, + type: { + type: String, + enum: Object.values(DatasetDataIndexTypeEnum), + default: DatasetDataIndexTypeEnum.custom }, dataId: { type: String, diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index 5d84e065b..7f7125290 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -13,11 +13,15 @@ import { POST } from '../../common/api/plusRequest'; export const readFileRawTextByUrl = async ({ teamId, + tmbId, url, + customPdfParse, relatedId }: { teamId: string; + tmbId: string; url: string; + customPdfParse?: boolean; relatedId: string; // externalFileId / apiFileId }) => { const response = await axios({ @@ -30,8 +34,11 @@ export const readFileRawTextByUrl = async ({ const buffer = Buffer.from(response.data, 'binary'); const { rawText } = await readRawContentByFileBuffer({ + customPdfParse, + isQAImport: false, extension, teamId, + tmbId, buffer, encoding: 'utf-8', metadata: { @@ -49,6 +56,7 @@ export const readFileRawTextByUrl = async ({ */ export const readDatasetSourceRawText = async ({ teamId, + tmbId, type, sourceId, isQAImport, @@ -56,11 +64,14 @@ export const readDatasetSourceRawText = async ({ externalFileId, apiServer, feishuServer, - yuqueServer + yuqueServer, + customPdfParse }: { teamId: string; + tmbId: string; type: DatasetSourceReadTypeEnum; sourceId: string; + customPdfParse?: boolean; isQAImport?: boolean; // csv data selector?: string; // link selector @@ -72,9 +83,11 @@ export const readDatasetSourceRawText = async ({ if (type === DatasetSourceReadTypeEnum.fileLocal) { const { rawText } = await readFileContentFromMongo({ teamId, + 
tmbId, bucketName: BucketNameEnum.dataset, fileId: sourceId, - isQAImport + isQAImport, + customPdfParse }); return rawText; } else if (type === DatasetSourceReadTypeEnum.link) { @@ -88,8 +101,10 @@ export const readDatasetSourceRawText = async ({ if (!externalFileId) return Promise.reject('FileId not found'); const rawText = await readFileRawTextByUrl({ teamId, + tmbId, url: sourceId, - relatedId: externalFileId + relatedId: externalFileId, + customPdfParse }); return rawText; } else if (type === DatasetSourceReadTypeEnum.apiFile) { @@ -98,7 +113,8 @@ export const readDatasetSourceRawText = async ({ feishuServer, yuqueServer, apiFileId: sourceId, - teamId + teamId, + tmbId }); return rawText; } @@ -110,16 +126,18 @@ export const readApiServerFileContent = async ({ feishuServer, yuqueServer, apiFileId, - teamId + teamId, + tmbId }: { apiServer?: APIFileServer; feishuServer?: FeishuServer; yuqueServer?: YuqueServer; apiFileId: string; teamId: string; + tmbId: string; }) => { if (apiServer) { - return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId }); + return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, tmbId, apiFileId }); } if (feishuServer || yuqueServer) { diff --git a/packages/service/core/dataset/schema.ts b/packages/service/core/dataset/schema.ts index f8f80ef4d..22f79fd25 100644 --- a/packages/service/core/dataset/schema.ts +++ b/packages/service/core/dataset/schema.ts @@ -67,6 +67,7 @@ const DatasetSchema = new Schema({ required: true, default: 'gpt-4o-mini' }, + vlmModel: String, intro: { type: String, default: '' diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index a8bbe9417..d740eec55 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -1,16 +1,17 @@ import { MongoDatasetTraining } from './schema'; import type { PushDatasetDataChunkProps, - PushDatasetDataProps, 
PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api.d'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { simpleText } from '@fastgpt/global/common/string/tools'; import { ClientSession } from '../../../common/mongo'; -import { getLLMModel, getEmbeddingModel } from '../../ai/model'; +import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model'; import { addLog } from '../../../common/system/log'; import { getCollectionWithDataset } from '../controller'; import { mongoSessionRun } from '../../../common/mongo/sessionRun'; +import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type'; +import { i18nT } from '../../../../web/i18n/utils'; export const lockTrainingDataByTeamId = async (teamId: string): Promise => { try { @@ -28,20 +29,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise => export const pushDataListToTrainingQueueByCollectionId = async ({ collectionId, ...props -}: { - teamId: string; - tmbId: string; - session?: ClientSession; -} & PushDatasetDataProps) => { +}: Omit) => { const { - dataset: { _id: datasetId, agentModel, vectorModel } + dataset: { _id: datasetId, agentModel, vectorModel, vlmModel } } = await getCollectionWithDataset(collectionId); return pushDataListToTrainingQueue({ ...props, datasetId, collectionId, + vectorModel, agentModel, - vectorModel + vlmModel }); }; @@ -52,30 +50,30 @@ export async function pushDataListToTrainingQueue({ collectionId, agentModel, vectorModel, + vlmModel, data, prompt, billId, - trainingMode = TrainingModeEnum.chunk, + mode = TrainingModeEnum.chunk, session -}: { - teamId: string; - tmbId: string; - datasetId: string; - agentModel: string; - vectorModel: string; - session?: ClientSession; -} & PushDatasetDataProps): Promise { +}: PushDataToTrainingQueueProps): Promise { + const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => { + if (mode !== TrainingModeEnum.image) 
return mode; + // 检查内容中,是否包含 ![](xxx) 的图片格式 + const text = data.q + data.a || ''; + const regex = /!\[\]\((.*?)\)/g; + const match = text.match(regex); + if (match) { + return TrainingModeEnum.image; + } + return mode; + }; const { model, maxToken, weight } = await (async () => { - const agentModelData = getLLMModel(agentModel); - if (!agentModelData) { - return Promise.reject(`File model ${agentModel} is inValid`); - } - const vectorModelData = getEmbeddingModel(vectorModel); - if (!vectorModelData) { - return Promise.reject(`Vector model ${vectorModel} is inValid`); - } - - if (trainingMode === TrainingModeEnum.chunk) { + if (mode === TrainingModeEnum.chunk) { + const vectorModelData = getEmbeddingModel(vectorModel); + if (!vectorModelData) { + return Promise.reject(i18nT('common:error_embedding_not_config')); + } return { maxToken: vectorModelData.maxToken * 1.5, model: vectorModelData.model, @@ -83,7 +81,11 @@ export async function pushDataListToTrainingQueue({ }; } - if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) { + if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) { + const agentModelData = getLLMModel(agentModel); + if (!agentModelData) { + return Promise.reject(i18nT('common:error_llm_not_config')); + } return { maxToken: agentModelData.maxContext * 0.8, model: agentModelData.model, @@ -91,8 +93,24 @@ export async function pushDataListToTrainingQueue({ }; } - return Promise.reject(`Training mode "${trainingMode}" is inValid`); + if (mode === TrainingModeEnum.image) { + const vllmModelData = getVlmModel(vlmModel); + if (!vllmModelData) { + return Promise.reject(i18nT('common:error_vlm_not_config')); + } + return { + maxToken: vllmModelData.maxContext * 0.8, + model: vllmModelData.model, + weight: 0 + }; + } + + return Promise.reject(`Training mode "${mode}" is inValid`); })(); + // Filter redundant params + if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) { + prompt = undefined; + 
} // filter repeat or equal content const set = new Set(); @@ -158,7 +176,7 @@ export async function pushDataListToTrainingQueue({ datasetId, collectionId, billId, - mode: trainingMode, + mode: getImageChunkMode(item, mode), prompt, model, q: item.q, diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts index 48e01613a..34044674d 100644 --- a/packages/service/core/dataset/training/schema.ts +++ b/packages/service/core/dataset/training/schema.ts @@ -1,14 +1,15 @@ /* 模型的知识库 */ -import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo'; -const { Schema, model, models } = connectionMongo; +import { connectionMongo, getMongoModel } from '../../../common/mongo'; +const { Schema } = connectionMongo; import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; -import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants'; +import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetColCollectionName } from '../collection/schema'; import { DatasetCollectionName } from '../schema'; import { TeamCollectionName, TeamMemberCollectionName } from '@fastgpt/global/support/user/team/constant'; +import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; export const DatasetTrainingCollectionName = 'dataset_trainings'; @@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({ }, datasetId: { type: Schema.Types.ObjectId, - ref: DatasetCollectionName, required: true }, collectionId: { @@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({ ref: DatasetColCollectionName, required: true }, - billId: { - // concat bill - type: String - }, + billId: String, mode: { type: String, - enum: Object.keys(TrainingTypeMap), + enum: Object.values(TrainingModeEnum), required: true }, + expireAt: { // It will be deleted after 7 days type: Date, @@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({ indexes: { type: [ { + 
type: { + type: String, + enum: Object.values(DatasetDataIndexTypeEnum) + }, text: { type: String, required: true @@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({ } }); +TrainingDataSchema.virtual('dataset', { + ref: DatasetCollectionName, + localField: 'datasetId', + foreignField: '_id', + justOne: true +}); +TrainingDataSchema.virtual('collection', { + ref: DatasetColCollectionName, + localField: 'collectionId', + foreignField: '_id', + justOne: true +}); + try { // lock training data(teamId); delete training data TrainingDataSchema.index({ teamId: 1, datasetId: 1 }); diff --git a/packages/service/core/workflow/dispatch/agent/runTool/index.ts b/packages/service/core/workflow/dispatch/agent/runTool/index.ts index c7aeabe31..b10b57eac 100644 --- a/packages/service/core/workflow/dispatch/agent/runTool/index.ts +++ b/packages/service/core/workflow/dispatch/agent/runTool/index.ts @@ -1,6 +1,7 @@ import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants'; import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/workflow/runtime/constants'; import type { + ChatDispatchProps, DispatchNodeResultType, RuntimeNodeItemType } from '@fastgpt/global/core/workflow/runtime/type'; @@ -46,7 +47,7 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< query, requestOrigin, chatConfig, - runningAppInfo: { teamId }, + runningUserInfo, externalProvider, params: { model, @@ -54,7 +55,8 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< userChatInput, history = 6, fileUrlList: fileLinks, - aiChatVision + aiChatVision, + aiChatReasoning } } = props; @@ -62,6 +64,9 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< const useVision = aiChatVision && toolModel.vision; const chatHistories = getHistories(history, histories); + props.params.aiChatVision = aiChatVision && toolModel.vision; + props.params.aiChatReasoning = aiChatReasoning && toolModel.reasoning; + 
const toolNodeIds = filterToolNodeIdByEdges({ nodeId, edges: runtimeEdges }); // Gets the module to which the tool is connected @@ -99,10 +104,11 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< const globalFiles = chatValue2RuntimePrompt(query).files; const { documentQuoteText, userFiles } = await getMultiInput({ + runningUserInfo, histories: chatHistories, requestOrigin, maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20, - teamId, + customPdfParse: chatConfig?.fileSelectConfig?.customPdfParse, fileLinks, inputFiles: globalFiles, hasReadFilesTool @@ -289,19 +295,21 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< }; const getMultiInput = async ({ + runningUserInfo, histories, fileLinks, requestOrigin, maxFiles, - teamId, + customPdfParse, inputFiles, hasReadFilesTool }: { + runningUserInfo: ChatDispatchProps['runningUserInfo']; histories: ChatItemType[]; fileLinks?: string[]; requestOrigin?: string; maxFiles: number; - teamId: string; + customPdfParse?: boolean; inputFiles: UserChatItemValueItemType['file'][]; hasReadFilesTool: boolean; }) => { @@ -329,7 +337,9 @@ const getMultiInput = async ({ urls, requestOrigin, maxFiles, - teamId + customPdfParse, + teamId: runningUserInfo.teamId, + tmbId: runningUserInfo.tmbId }); return { diff --git a/packages/service/core/workflow/dispatch/agent/runTool/promptCall.ts b/packages/service/core/workflow/dispatch/agent/runTool/promptCall.ts index 58e95a059..c5c0cb4bd 100644 --- a/packages/service/core/workflow/dispatch/agent/runTool/promptCall.ts +++ b/packages/service/core/workflow/dispatch/agent/runTool/promptCall.ts @@ -24,7 +24,12 @@ import { import { AIChatItemType } from '@fastgpt/global/core/chat/type'; import { GPTMessages2Chats } from '@fastgpt/global/core/chat/adapt'; import { formatToolResponse, initToolCallEdges, initToolNodes } from './utils'; -import { computedMaxToken, llmCompletionsBodyFormat } from '../../../../ai/utils'; +import { + 
computedMaxToken, + llmCompletionsBodyFormat, + parseReasoningContent, + parseReasoningStreamContent +} from '../../../../ai/utils'; import { WorkflowResponseType } from '../../type'; import { toolValueTypeList } from '@fastgpt/global/core/workflow/constants'; import { WorkflowInteractiveResponseType } from '@fastgpt/global/core/workflow/template/system/interactive/type'; @@ -58,6 +63,7 @@ export const runToolWithPromptCall = async ( temperature, maxToken, aiChatVision, + aiChatReasoning, aiChatTopP, aiChatStopSign, aiChatResponseFormat, @@ -216,7 +222,7 @@ export const runToolWithPromptCall = async ( const [requestMessages] = await Promise.all([ loadRequestMessages({ messages: filterMessages, - useVision: toolModel.vision && aiChatVision, + useVision: aiChatVision, origin: requestOrigin }) ]); @@ -251,22 +257,46 @@ export const runToolWithPromptCall = async ( } }); - const answer = await (async () => { + const { answer, reasoning } = await (async () => { if (res && isStreamResponse) { - const { answer } = await streamResponse({ + const { answer, reasoning } = await streamResponse({ res, toolNodes, stream: aiResponse, - workflowStreamResponse + workflowStreamResponse, + aiChatReasoning }); - return answer; + return { answer, reasoning }; } else { - const result = aiResponse as ChatCompletion; + const content = aiResponse.choices?.[0]?.message?.content || ''; + const reasoningContent: string = aiResponse.choices?.[0]?.message?.reasoning_content || ''; - return result.choices?.[0]?.message?.content || ''; + // API already parse reasoning content + if (reasoningContent || !aiChatReasoning) { + return { + answer: content, + reasoning: reasoningContent + }; + } + + const [think, answer] = parseReasoningContent(content); + return { + answer, + reasoning: think + }; } })(); + + if (stream && !isStreamResponse && aiChatReasoning && reasoning) { + workflowStreamResponse?.({ + event: SseResponseEventEnum.fastAnswer, + data: textAdaptGptResponse({ + reasoning_content: 
reasoning + }) + }); + } + const { answer: replaceAnswer, toolJson } = parseAnswer(answer); if (!answer && !toolJson) { return Promise.reject(getEmptyResponseTip()); @@ -294,11 +324,16 @@ export const runToolWithPromptCall = async ( } // No tool is invoked, indicating that the process is over - const gptAssistantResponse: ChatCompletionAssistantMessageParam = { + const gptAssistantResponse: ChatCompletionMessageParam = { role: ChatCompletionRequestMessageRoleEnum.Assistant, - content: replaceAnswer + content: replaceAnswer, + reasoning_text: reasoning }; - const completeMessages = filterMessages.concat(gptAssistantResponse); + const completeMessages = filterMessages.concat({ + ...gptAssistantResponse, + reasoning_text: undefined + }); + const inputTokens = await countGptMessagesTokens(requestMessages); const outputTokens = await countGptMessagesTokens([gptAssistantResponse]); @@ -379,9 +414,10 @@ export const runToolWithPromptCall = async ( })(); // 合并工具调用的结果,使用 functionCall 格式存储。 - const assistantToolMsgParams: ChatCompletionAssistantMessageParam = { + const assistantToolMsgParams: ChatCompletionMessageParam = { role: ChatCompletionRequestMessageRoleEnum.Assistant, - function_call: toolJson + function_call: toolJson, + reasoning_text: reasoning }; // Only toolCall tokens are counted here, Tool response tokens count towards the next reply @@ -502,12 +538,14 @@ ANSWER: `; async function streamResponse({ res, stream, - workflowStreamResponse + workflowStreamResponse, + aiChatReasoning }: { res: NextApiResponse; toolNodes: ToolNodeItemType[]; stream: StreamChatType; workflowStreamResponse?: WorkflowResponseType; + aiChatReasoning?: boolean; }) { const write = responseWriteController({ res, @@ -515,7 +553,9 @@ async function streamResponse({ }); let startResponseWrite = false; - let textAnswer = ''; + let answer = ''; + let reasoning = ''; + const { parsePart, getStartTagBuffer } = parseReasoningStreamContent(); for await (const part of stream) { if (res.closed) { @@ 
-523,13 +563,21 @@ async function streamResponse({ break; } - const responseChoice = part.choices?.[0]?.delta; - // console.log(responseChoice, '---==='); + const [reasoningContent, content] = parsePart(part, aiChatReasoning); + answer += content; + reasoning += reasoningContent; - if (responseChoice?.content) { - const content = responseChoice?.content || ''; - textAnswer += content; + if (aiChatReasoning && reasoningContent) { + workflowStreamResponse?.({ + write, + event: SseResponseEventEnum.answer, + data: textAdaptGptResponse({ + reasoning_content: reasoningContent + }) + }); + } + if (content) { if (startResponseWrite) { workflowStreamResponse?.({ write, @@ -538,18 +586,20 @@ async function streamResponse({ text: content }) }); - } else if (textAnswer.length >= 3) { - textAnswer = textAnswer.trim(); - if (textAnswer.startsWith('0')) { + } else if (answer.length >= 3) { + answer = answer.trimStart(); + if (/0(:|:)/.test(answer)) { startResponseWrite = true; + // find first : index - const firstIndex = textAnswer.indexOf(':'); - textAnswer = textAnswer.substring(firstIndex + 1).trim(); + const firstIndex = + answer.indexOf('0:') !== -1 ? answer.indexOf('0:') : answer.indexOf('0:'); + answer = answer.substring(firstIndex + 2).trim(); workflowStreamResponse?.({ write, event: SseResponseEventEnum.answer, data: textAdaptGptResponse({ - text: textAnswer + text: answer }) }); } @@ -557,7 +607,23 @@ async function streamResponse({ } } - return { answer: textAnswer.trim() }; + if (answer === '') { + answer = getStartTagBuffer(); + if (/0(:|:)/.test(answer)) { + // find first : index + const firstIndex = answer.indexOf('0:') !== -1 ? 
answer.indexOf('0:') : answer.indexOf('0:'); + answer = answer.substring(firstIndex + 2).trim(); + workflowStreamResponse?.({ + write, + event: SseResponseEventEnum.answer, + data: textAdaptGptResponse({ + text: answer + }) + }); + } + } + + return { answer, reasoning }; } const parseAnswer = ( @@ -568,8 +634,7 @@ const parseAnswer = ( } => { str = str.trim(); // 首先,使用正则表达式提取TOOL_ID和TOOL_ARGUMENTS - const prefixReg = /^1(:|:)/; - const answerPrefixReg = /^0(:|:)/; + const prefixReg = /1(:|:)/; if (prefixReg.test(str)) { const toolString = sliceJsonStr(str); @@ -585,13 +650,21 @@ const parseAnswer = ( } }; } catch (error) { - return { - answer: ERROR_TEXT - }; + if (/^1(:|:)/.test(str)) { + return { + answer: ERROR_TEXT + }; + } else { + return { + answer: str + }; + } } } else { + const firstIndex = str.indexOf('0:') !== -1 ? str.indexOf('0:') : str.indexOf('0:'); + const answer = str.substring(firstIndex + 2).trim(); return { - answer: str.replace(answerPrefixReg, '') + answer }; } }; diff --git a/packages/service/core/workflow/dispatch/agent/runTool/type.d.ts b/packages/service/core/workflow/dispatch/agent/runTool/type.d.ts index 64ecd79fc..61cb6b217 100644 --- a/packages/service/core/workflow/dispatch/agent/runTool/type.d.ts +++ b/packages/service/core/workflow/dispatch/agent/runTool/type.d.ts @@ -22,6 +22,7 @@ export type DispatchToolModuleProps = ModuleDispatchProps<{ [NodeInputKeyEnum.aiChatTemperature]: number; [NodeInputKeyEnum.aiChatMaxToken]: number; [NodeInputKeyEnum.aiChatVision]?: boolean; + [NodeInputKeyEnum.aiChatReasoning]?: boolean; [NodeInputKeyEnum.aiChatTopP]?: number; [NodeInputKeyEnum.aiChatStopSign]?: string; [NodeInputKeyEnum.aiChatResponseFormat]?: string; diff --git a/packages/service/core/workflow/dispatch/chat/oneapi.ts b/packages/service/core/workflow/dispatch/chat/oneapi.ts index a51ca63b0..07035a826 100644 --- a/packages/service/core/workflow/dispatch/chat/oneapi.ts +++ b/packages/service/core/workflow/dispatch/chat/oneapi.ts @@ -11,7 
+11,10 @@ import { formatModelChars2Points } from '../../../../support/wallet/usage/utils' import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d'; import { postTextCensor } from '../../../../common/api/requestPlusApi'; import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants'; -import type { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type'; +import type { + ChatDispatchProps, + DispatchNodeResultType +} from '@fastgpt/global/core/workflow/runtime/type'; import { countGptMessagesTokens } from '../../../../common/string/tiktoken/index'; import { chats2GPTMessages, @@ -69,7 +72,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise if (stringQuoteText) { @@ -400,7 +406,9 @@ async function getMultiInput({ urls, requestOrigin, maxFiles, - teamId + customPdfParse, + teamId: runningUserInfo.teamId, + tmbId: runningUserInfo.tmbId }); return { @@ -555,6 +563,15 @@ async function streamResponse({ // if answer is empty, try to get value from startTagBuffer. (Cause: The response content is too short to exceed the minimum parse length) if (answer === '') { answer = getStartTagBuffer(); + if (isResponseAnswerText && answer) { + workflowStreamResponse?.({ + write, + event: SseResponseEventEnum.answer, + data: textAdaptGptResponse({ + text: answer + }) + }); + } } return { answer, reasoning }; diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts b/packages/service/core/workflow/dispatch/tools/readFiles.ts index 1e2c5953b..f4593c375 100644 --- a/packages/service/core/workflow/dispatch/tools/readFiles.ts +++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts @@ -45,13 +45,14 @@ ${content.slice(0, 100)}${content.length > 100 ? '......' 
: ''} export const dispatchReadFiles = async (props: Props): Promise => { const { requestOrigin, - runningAppInfo: { teamId }, + runningUserInfo: { teamId, tmbId }, histories, chatConfig, node: { version }, params: { fileUrlList = [] } } = props; const maxFiles = chatConfig?.fileSelectConfig?.maxFiles || 20; + const customPdfParse = chatConfig?.fileSelectConfig?.customPdfParse || false; // Get files from histories const filesFromHistories = version !== '489' ? [] : getHistoryFileLinks(histories); @@ -61,7 +62,9 @@ export const dispatchReadFiles = async (props: Props): Promise => { urls: [...fileUrlList, ...filesFromHistories], requestOrigin, maxFiles, - teamId + teamId, + tmbId, + customPdfParse }); return { @@ -105,12 +108,16 @@ export const getFileContentFromLinks = async ({ urls, requestOrigin, maxFiles, - teamId + teamId, + tmbId, + customPdfParse }: { urls: string[]; requestOrigin?: string; maxFiles: number; teamId: string; + tmbId: string; + customPdfParse?: boolean; }) => { const parseUrlList = urls // Remove invalid urls @@ -205,8 +212,10 @@ export const getFileContentFromLinks = async ({ extension, isQAImport: false, teamId, + tmbId, buffer, - encoding + encoding, + customPdfParse }); // Add to buffer diff --git a/packages/service/support/wallet/usage/controller.ts b/packages/service/support/wallet/usage/controller.ts index 7eee2f0f1..b9b6dcd1b 100644 --- a/packages/service/support/wallet/usage/controller.ts +++ b/packages/service/support/wallet/usage/controller.ts @@ -117,14 +117,16 @@ export const createTrainingUsage = async ({ billSource, vectorModel, agentModel, + vllmModel, session }: { teamId: string; tmbId: string; appName: string; billSource: UsageSourceEnum; - vectorModel: string; - agentModel: string; + vectorModel?: string; + agentModel?: string; + vllmModel?: string; session?: ClientSession; }) => { const [{ _id }] = await MongoUsage.create( @@ -136,27 +138,46 @@ export const createTrainingUsage = async ({ source: billSource, totalPoints: 0, 
list: [ - { - moduleName: i18nT('common:support.wallet.moduleName.index'), - model: vectorModel, - amount: 0, - inputTokens: 0, - outputTokens: 0 - }, - { - moduleName: i18nT('common:support.wallet.moduleName.qa'), - model: agentModel, - amount: 0, - inputTokens: 0, - outputTokens: 0 - }, - { - moduleName: i18nT('common:core.dataset.training.Auto mode'), - model: agentModel, - amount: 0, - inputTokens: 0, - outputTokens: 0 - } + ...(vectorModel + ? [ + { + moduleName: i18nT('account_usage:embedding_index'), + model: vectorModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + } + ] + : []), + ...(agentModel + ? [ + { + moduleName: i18nT('account_usage:qa'), + model: agentModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + }, + { + moduleName: i18nT('account_usage:auto_index'), + model: agentModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + } + ] + : []), + ...(vllmModel + ? [ + { + moduleName: i18nT('account_usage:image_parse'), + model: vllmModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + } + ] + : []) ] } ], @@ -165,3 +186,31 @@ export const createTrainingUsage = async ({ return { billId: String(_id) }; }; + +export const createPdfParseUsage = async ({ + teamId, + tmbId, + pages +}: { + teamId: string; + tmbId: string; + pages: number; +}) => { + const unitPrice = global.systemEnv?.customPdfParse?.price || 0; + const totalPoints = pages * unitPrice; + + createUsage({ + teamId, + tmbId, + appName: i18nT('account_usage:pdf_enhanced_parse'), + totalPoints, + source: UsageSourceEnum.pdfParse, + list: [ + { + moduleName: i18nT('account_usage:pdf_enhanced_parse'), + amount: totalPoints, + pages + } + ] + }); +}; diff --git a/packages/service/worker/htmlStr2Md/utils.ts b/packages/service/worker/htmlStr2Md/utils.ts index 8384d005a..0602fc818 100644 --- a/packages/service/worker/htmlStr2Md/utils.ts +++ b/packages/service/worker/htmlStr2Md/utils.ts @@ -1,6 +1,6 @@ import TurndownService from 'turndown'; import { ImageType } from 
'../readFile/type'; -import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown'; +import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { getNanoid } from '@fastgpt/global/common/string/tools'; // @ts-ignore const turndownPluginGfm = require('joplin-turndown-plugin-gfm'); @@ -46,7 +46,7 @@ export const html2md = ( // Base64 img to id, otherwise it will occupy memory when going to md const { processedHtml, images } = processBase64Images(html); const md = turndownService.turndown(processedHtml); - const { text, imageList } = matchMdImgTextAndUpload(md); + const { text, imageList } = matchMdImg(md); return { rawText: text, diff --git a/packages/service/worker/readFile/index.ts b/packages/service/worker/readFile/index.ts index 1c44852a8..45092ed72 100644 --- a/packages/service/worker/readFile/index.ts +++ b/packages/service/worker/readFile/index.ts @@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx'; import { readCsvRawText } from './extension/csv'; parentPort?.on('message', async (props: ReadRawTextProps) => { - const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => { + const read = async (params: ReadRawTextByBuffer) => { switch (params.extension) { case 'txt': case 'md': @@ -27,7 +27,9 @@ parentPort?.on('message', async (props: ReadRawTextProps) => { case 'csv': return readCsvRawText(params); default: - return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx'); + return Promise.reject( + `Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx. 
"${params.extension}" is not supported.` + ); } }; @@ -41,7 +43,7 @@ parentPort?.on('message', async (props: ReadRawTextProps) => { try { parentPort?.postMessage({ type: 'success', - data: await readRawContentByFileBuffer(newProps) + data: await read(newProps) }); } catch (error) { console.log(error); diff --git a/packages/web/components/common/Image/PhotoView.tsx b/packages/web/components/common/Image/PhotoView.tsx index e7867e212..a0a8a8aa9 100644 --- a/packages/web/components/common/Image/PhotoView.tsx +++ b/packages/web/components/common/Image/PhotoView.tsx @@ -17,7 +17,7 @@ const MyPhotoView = (props: ImageProps) => { loadingElement={} > - + ); diff --git a/packages/web/components/common/MyBox/index.tsx b/packages/web/components/common/MyBox/index.tsx index 501ff568a..fcfaa5154 100644 --- a/packages/web/components/common/MyBox/index.tsx +++ b/packages/web/components/common/MyBox/index.tsx @@ -11,8 +11,8 @@ type Props = BoxProps & { const MyBox = ({ text, isLoading, children, size, ...props }: Props, ref: any) => { return ( - {isLoading && } {children} + {isLoading && } ); }; diff --git a/packages/web/components/common/Radio/LeftRadio.tsx b/packages/web/components/common/Radio/LeftRadio.tsx index 1d7492120..9e244ba4c 100644 --- a/packages/web/components/common/Radio/LeftRadio.tsx +++ b/packages/web/components/common/Radio/LeftRadio.tsx @@ -1,26 +1,24 @@ import React from 'react'; import { Box, Flex, useTheme, Grid, type GridProps, HStack } from '@chakra-ui/react'; import { useTranslation } from 'next-i18next'; -import MyTooltip from '../MyTooltip'; import QuestionTip from '../MyTooltip/QuestionTip'; -// @ts-ignore -interface Props extends GridProps { +type Props = Omit & { list: { title: string; desc?: string; - value: any; + value: T; children?: React.ReactNode; tooltip?: string; }[]; align?: 'flex-top' | 'center'; - value: any; + value: T; defaultBg?: string; activeBg?: string; - onChange: (e: any) => void; -} + onChange: (e: T) => void; +}; -const LeftRadio 
= ({ +const LeftRadio = ({ list, value, align = 'flex-top', @@ -30,7 +28,7 @@ const LeftRadio = ({ activeBg = 'primary.50', onChange, ...props -}: Props) => { +}: Props) => { const { t } = useTranslation(); const theme = useTheme(); @@ -39,7 +37,7 @@ const LeftRadio = ({ {list.map((item) => ( {typeof item.title === 'string' ? t(item.title as any) : item.title} - {!!item.tooltip && } + {!!item.tooltip && } {!!item.desc && ( diff --git a/packages/web/i18n/en/account_model.json b/packages/web/i18n/en/account_model.json index dcf4a7866..f7143cfdb 100644 --- a/packages/web/i18n/en/account_model.json +++ b/packages/web/i18n/en/account_model.json @@ -21,6 +21,7 @@ "edit_channel": "Channel configuration", "enable_channel": "Enable", "forbid_channel": "Disabled", + "input maxToken_tip": "The model max_tokens parameter, if left blank, means that the model does not support it.", "key_type": "API key format:", "log": "Call log", "log_detail": "Log details", @@ -28,6 +29,7 @@ "log_status": "Status", "mapping": "Model Mapping", "mapping_tip": "A valid Json is required. \nThe model can be mapped when sending a request to the actual address. 
\nFor example:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\nWhen FastGPT requests the gpt-4o model, the gpt-4o-test model is sent to the actual address, instead of gpt-4o.", + "max_temperature_tip": "If the model temperature parameter is not filled in, it means that the model does not support the temperature parameter.", "model": "Model", "model_name": "Model name", "model_test": "Model testing", @@ -43,5 +45,7 @@ "selected_model_empty": "Choose at least one model", "start_test": "Start testing {{num}} models", "test_failed": "There are {{num}} models that report errors", + "vlm_model": "Vlm", + "vlm_model_tip": "Used to generate additional indexing of images in a document in the knowledge base", "waiting_test": "Waiting for testing" } diff --git a/packages/web/i18n/en/account_usage.json b/packages/web/i18n/en/account_usage.json index bfb9cd72e..459007293 100644 --- a/packages/web/i18n/en/account_usage.json +++ b/packages/web/i18n/en/account_usage.json @@ -2,6 +2,7 @@ "ai_model": "AI model", "all": "all", "app_name": "Application name", + "auto_index": "Auto index", "billing_module": "Deduction module", "confirm_export": "A total of {{total}} pieces of data were filtered out. 
Are you sure to export?", "current_filter_conditions": "Current filter conditions", @@ -9,6 +10,7 @@ "details": "Details", "dingtalk": "DingTalk", "duration_seconds": "Duration (seconds)", + "embedding_index": "Embedding", "every_day": "Day", "every_month": "Moon", "export_confirm": "Export confirmation", @@ -16,6 +18,7 @@ "export_title": "Time,Members,Type,Project name,AI points", "feishu": "Feishu", "generation_time": "Generation time", + "image_parse": "Image tagging", "input_token_length": "input tokens", "member": "member", "member_name": "Member name", @@ -25,8 +28,12 @@ "official_account": "Official Account", "order_number": "Order number", "output_token_length": "output tokens", + "pages": "Pages", + "pdf_enhanced_parse": "PDF Enhanced Analysis", + "pdf_parse": "PDF Analysis", "points": "Points", "project_name": "Project name", + "qa": "QA", "select_member_and_source_first": "Please select members and types first", "share": "Share Link", "source": "source", diff --git a/packages/web/i18n/en/app.json b/packages/web/i18n/en/app.json index 0295f3f94..cc0bc8f46 100644 --- a/packages/web/i18n/en/app.json +++ b/packages/web/i18n/en/app.json @@ -105,6 +105,9 @@ "open_vision_function_tip": "Models with icon switches have image recognition capabilities. \nAfter being turned on, the model will parse the pictures in the file link and automatically parse the pictures in the user's question (user question ≤ 500 words).", "or_drag_JSON": "or drag in JSON file", "paste_config_or_drag": "Paste config or drag JSON file here", + "pdf_enhance_parse": "PDF enhancement analysis", + "pdf_enhance_parse_price": "{{price}}Points/page", + "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. 
At the same time, you can also identify scanned documents, which will take a long time to identify them.", "permission.des.manage": "Based on write permissions, you can configure publishing channels, view conversation logs, and assign permissions to the application.", "permission.des.read": "Use the app to have conversations", "permission.des.write": "Can view and edit apps", diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json index 4da8c7f2d..d71060c08 100644 --- a/packages/web/i18n/en/common.json +++ b/packages/web/i18n/en/common.json @@ -562,10 +562,7 @@ "core.dataset.file": "File", "core.dataset.folder": "Directory", "core.dataset.import.Auto mode Estimated Price Tips": "Requires calling the file processing model, which consumes a lot of tokens: {{price}} points/1K tokens", - "core.dataset.import.Auto process": "Automatic", - "core.dataset.import.Auto process desc": "Automatically set segmentation and preprocessing rules", "core.dataset.import.Chunk Range": "Range: {{min}}~{{max}}", - "core.dataset.import.Chunk Split": "Chunks", "core.dataset.import.Chunk Split Tip": "Segment the text according to certain rules and convert it into a format that can be semantically searched. Suitable for most scenarios. No additional model processing is required, and the cost is low.", "core.dataset.import.Continue upload": "Continue upload", "core.dataset.import.Custom process": "Custom Rules", @@ -575,7 +572,6 @@ "core.dataset.import.Custom split char Tips": "Allows you to segment based on custom separators. 
Usually used for pre-processed data, using specific separators for precise segmentation.", "core.dataset.import.Custom text": "Custom Text", "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset", - "core.dataset.import.Data Preprocessing": "Data Processing", "core.dataset.import.Data process params": "Data Processing Parameters", "core.dataset.import.Down load csv template": "Click to Download CSV Template", "core.dataset.import.Embedding Estimated Price Tips": "Only use the index model, consuming a small amount of AI points: {{price}} points/1K tokens", @@ -597,7 +593,6 @@ "core.dataset.import.Source name": "Source Name", "core.dataset.import.Sources list": "Sources", "core.dataset.import.Start upload": "Start Upload", - "core.dataset.import.Total files": "Total {{total}} Files", "core.dataset.import.Upload complete": "Upload complete", "core.dataset.import.Upload data": "Confirm Upload", "core.dataset.import.Upload file progress": "File Upload Progress", @@ -649,10 +644,10 @@ "core.dataset.training.Agent queue": "QA Training Queue", "core.dataset.training.Auto mode": "Auto index", "core.dataset.training.Auto mode Tip": "Increase the semantic richness of data blocks by generating related questions and summaries through sub-indexes and calling models, making it more conducive to retrieval. 
Requires more storage space and increases AI call times.", - "core.dataset.training.Chunk mode": "Default", + "core.dataset.training.Chunk mode": "Chunk", "core.dataset.training.Full": "Estimated Over 5 Minutes", "core.dataset.training.Leisure": "Idle", - "core.dataset.training.QA mode": "QA Chunks", + "core.dataset.training.QA mode": "QA", "core.dataset.training.Vector queue": "Index Queue", "core.dataset.training.Waiting": "Estimated 5 Minutes", "core.dataset.training.Website Sync": "Website Sync", @@ -861,7 +856,6 @@ "dataset.collections.Select Collection": "Select File", "dataset.collections.Select One Collection To Store": "Select a File to Store", "dataset.data.Can not edit": "No Edit Permission", - "dataset.data.Custom Index Number": "Custom Index {{number}}", "dataset.data.Default Index": "Default Index", "dataset.data.Delete Tip": "Confirm to Delete This Data?", "dataset.data.Index Placeholder": "Enter Index Text Content", @@ -889,6 +883,9 @@ "error.upload_image_error": "File upload failed", "error.username_empty": "Account cannot be empty", "error_collection_not_exist": "The collection does not exist", + "error_embedding_not_config": "Unconfigured index model", + "error_llm_not_config": "Unconfigured file understanding model", + "error_vlm_not_config": "Image comprehension model not configured", "extraction_results": "Extraction Results", "field_name": "Field Name", "free": "Free", @@ -956,6 +953,7 @@ "new_create": "Create New", "no": "No", "no_laf_env": "System Not Configured with Laf Environment", + "not_model_config": "No related model configured", "not_yet_introduced": "No Introduction Yet", "option": "Option", "pay.amount": "Amount", @@ -1121,7 +1119,6 @@ "support.wallet.invoice_detail": "Invoice Details", "support.wallet.invoice_info": "The invoice will be sent to the email within 3-7 working days, please wait patiently", "support.wallet.invoicing": "Invoicing", - "support.wallet.moduleName.index": "Index Generation", "support.wallet.moduleName.qa": 
"QA Split", "support.wallet.noBill": "No Bill Records", "support.wallet.no_invoice": "No Invoice Records", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index 79ce4ac11..0902a421e 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -3,11 +3,16 @@ "add_file": "Import", "api_file": "API Dataset", "api_url": "API Url", + "auto_indexes": "Automatically generate supplementary indexes", + "auto_indexes_tips": "Additional index generation is performed through large models to improve semantic richness and improve retrieval accuracy.", "chunk_max_tokens": "max_tokens", "close_auto_sync": "Are you sure you want to turn off automatic sync?", "collection.Create update time": "Creation/Update Time", "collection.Training type": "Training", + "collection.training_type": "Chunk type", "collection_data_count": "Data amount", + "collection_metadata_custom_pdf_parse": "PDF enhancement analysis", + "collection_metadata_image_parse": "Image tagging", "collection_not_support_retraining": "This collection type does not support retuning parameters", "collection_not_support_sync": "This collection does not support synchronization", "collection_sync": "Sync data", @@ -22,12 +27,21 @@ "custom_data_process_params_desc": "Customize data processing rules", "data.ideal_chunk_length": "ideal block length", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", + "data_index_custom": "Custom index", + "data_index_default": "Default index", + "data_index_image": "Image Index", + "data_index_num": "Index {{index}}", + "data_index_question": "Inferred question index", + "data_index_summary": "Summary index", "data_process_params": "Params", "data_process_setting": "Processing config", "dataset.Unsupported operation": "dataset.Unsupported operation", "dataset.no_collections": "No datasets available", "dataset.no_tags": "No tags available", + "default_params": "default", + "default_params_desc": "Use system default 
parameters and rules", "edit_dataset_config": "Edit knowledge base configuration", + "enhanced_indexes": "Index enhancement", "error.collectionNotFound": "Collection not found~", "external_file": "External File Library", "external_file_dataset_desc": "Import files from an external file library to build a Dataset. The files will not be stored again.", @@ -38,19 +52,38 @@ "feishu_dataset": "Feishu Dataset", "feishu_dataset_config": "Feishu Dataset Config", "feishu_dataset_desc": "Can build a dataset using Feishu documents by configuring permissions, without secondary storage", + "file_list": "File list", "file_model_function_tip": "Enhances indexing and QA generation", "filename": "Filename", "folder_dataset": "Folder", "ideal_chunk_length": "ideal block length", "ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.", + "image_auto_parse": "Automatic image indexing", + "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes", "import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens", "import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens", + "import_confirm": "Confirm upload", + "import_data_preview": "Data preview", + "import_data_process_setting": "Data processing method settings", + "import_file_parse_setting": "File parsing settings", + "import_model_config": "Model selection", + "import_param_setting": "Parameter settings", + "import_select_file": "Select a file", "is_open_schedule": "Enable scheduled synchronization", + "keep_image": "Keep the picture", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original 
permission settings will become invalid.", "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.", + "params_setting": "Parameter settings", + "pdf_enhance_parse": "PDF enhancement analysis", + "pdf_enhance_parse_price": "{{price}} points/page", + "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.", "permission.des.manage": "Can manage the entire knowledge base data and information", "permission.des.read": "View knowledge base content", "permission.des.write": "Ability to add and change knowledge base content", + "preview_chunk": "Preview chunks", + "preview_chunk_empty": "Unable to read the contents of the file", + "preview_chunk_intro": "Display up to 10 pieces", + "preview_chunk_not_selected": "Click on the file on the left to preview", "rebuild_embedding_start_tip": "Index model switching task has started", "rebuilding_index_count": "Number of indexes being rebuilt: {{count}}", "request_headers": "Request headers, will automatically append 'Bearer '", @@ -72,8 +105,10 @@ "tag.tags": "Tags", "tag.total_tags": "Total {{total}} tags", "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "The Dataset has indexes that are being trained or rebuilt", + "total_num_files": "Total {{total}} files", "training_mode": "Chunk mode", "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens", + "vllm_model": "Image understanding model", "website_dataset": "Website Sync", "website_dataset_desc": "Website sync allows you to build a Dataset directly using a web link.", "yuque_dataset": "Yuque Dataset", diff --git a/packages/web/i18n/zh-CN/account_model.json 
b/packages/web/i18n/zh-CN/account_model.json index 89550d17b..4626d717b 100644 --- a/packages/web/i18n/zh-CN/account_model.json +++ b/packages/web/i18n/zh-CN/account_model.json @@ -21,6 +21,7 @@ "edit_channel": "渠道配置", "enable_channel": "启用", "forbid_channel": "禁用", + "input maxToken_tip": "模型 max_tokens 参数,如果留空,则代表模型不支持该参数。", "key_type": "API key 格式: ", "log": "调用日志", "log_detail": "日志详情", @@ -28,6 +29,7 @@ "log_status": "状态", "mapping": "模型映射", "mapping_tip": "需填写一个有效 Json。可在向实际地址发送请求时,对模型进行映射。例如:\n{\n \"gpt-4o\": \"gpt-4o-test\"\n}\n当 FastGPT 请求 gpt-4o 模型时,会向实际地址发送 gpt-4o-test 的模型,而不是 gpt-4o。", + "max_temperature_tip": "模型 temperature 参数,不填则代表模型不支持 temperature 参数。", "model": "模型", "model_name": "模型名", "model_test": "模型测试", @@ -43,5 +45,7 @@ "selected_model_empty": "至少选择一个模型", "start_test": "开始测试{{num}}个模型", "test_failed": "有{{num}}个模型报错", + "vlm_model": "图片理解模型", + "vlm_model_tip": "用于知识库中对文档中的图片进行额外的索引生成", "waiting_test": "等待测试" } diff --git a/packages/web/i18n/zh-CN/account_usage.json b/packages/web/i18n/zh-CN/account_usage.json index 3b7bb0792..925568857 100644 --- a/packages/web/i18n/zh-CN/account_usage.json +++ b/packages/web/i18n/zh-CN/account_usage.json @@ -2,6 +2,7 @@ "ai_model": "AI 模型", "all": "所有", "app_name": "应用名", + "auto_index": "索引增强", "billing_module": "扣费模块", "confirm_export": "共筛选出 {{total}} 条数据,是否确认导出?", "current_filter_conditions": "当前筛选条件:", @@ -9,6 +10,7 @@ "details": "详情", "dingtalk": "钉钉", "duration_seconds": "时长(秒)", + "embedding_index": "索引生成", "every_day": "天", "every_month": "月", "every_week": "每周", @@ -18,6 +20,7 @@ "export_title": "时间,成员,类型,项目名,AI 积分消耗", "feishu": "飞书", "generation_time": "生成时间", + "image_parse": "图片标注", "input_token_length": "输入 tokens", "member": "成员", "member_name": "成员名", @@ -27,8 +30,12 @@ "official_account": "公众号", "order_number": "订单号", "output_token_length": "输出 tokens", + "pages": "页数", + "pdf_enhanced_parse": "PDF 增强解析", + "pdf_parse": "PDF 解析", "points": "积分", "project_name": "项目名", + "qa": "问答对提取", 
"select_member_and_source_first": "请先选中成员和类型", "share": "分享链接", "source": "来源", diff --git a/packages/web/i18n/zh-CN/app.json b/packages/web/i18n/zh-CN/app.json index e76d5901f..75232c3d2 100644 --- a/packages/web/i18n/zh-CN/app.json +++ b/packages/web/i18n/zh-CN/app.json @@ -105,6 +105,9 @@ "open_vision_function_tip": "有图示开关的模型即拥有图片识别能力。若开启,模型会解析文件链接里的图片,并自动解析用户问题中的图片(用户问题≤500字时生效)。", "or_drag_JSON": "或拖入JSON文件", "paste_config_or_drag": "粘贴配置或拖入 JSON 文件", + "pdf_enhance_parse": "PDF增强解析", + "pdf_enhance_parse_price": "{{price}}积分/页", + "pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。", "permission.des.manage": "写权限基础上,可配置发布渠道、查看对话日志、分配该应用权限", "permission.des.read": "可使用该应用进行对话", "permission.des.write": "可查看和编辑应用", diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json index b010aee9d..3279be401 100644 --- a/packages/web/i18n/zh-CN/common.json +++ b/packages/web/i18n/zh-CN/common.json @@ -565,10 +565,7 @@ "core.dataset.file": "文件", "core.dataset.folder": "目录", "core.dataset.import.Auto mode Estimated Price Tips": "需调用文本理解模型,需要消耗较多AI 积分:{{price}} 积分/1K tokens", - "core.dataset.import.Auto process": "自动", - "core.dataset.import.Auto process desc": "自动设置分割和预处理规则", "core.dataset.import.Chunk Range": "范围:{{min}}~{{max}}", - "core.dataset.import.Chunk Split": "直接分段", "core.dataset.import.Chunk Split Tip": "将文本按一定的规则进行分段处理后,转成可进行语义搜索的格式,适合绝大多数场景。不需要调用模型额外处理,成本低。", "core.dataset.import.Continue upload": "继续上传", "core.dataset.import.Custom process": "自定义规则", @@ -578,7 +575,6 @@ "core.dataset.import.Custom split char Tips": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。", "core.dataset.import.Custom text": "自定义文本", "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集", - "core.dataset.import.Data Preprocessing": "数据处理", "core.dataset.import.Data process params": "数据处理参数", "core.dataset.import.Down load csv template": "点击下载 CSV 模板", "core.dataset.import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 
积分:{{price}} 积分/1K tokens", @@ -600,7 +596,6 @@ "core.dataset.import.Source name": "来源名", "core.dataset.import.Sources list": "来源列表", "core.dataset.import.Start upload": "开始上传", - "core.dataset.import.Total files": "共 {{total}} 个文件", "core.dataset.import.Upload complete": "完成上传", "core.dataset.import.Upload data": "确认上传", "core.dataset.import.Upload file progress": "文件上传进度", @@ -650,12 +645,12 @@ "core.dataset.test.test result placeholder": "测试结果将在这里展示", "core.dataset.test.test result tip": "根据知识库内容与测试文本的相似度进行排序,你可以根据测试结果调整对应的文本。\n注意:测试记录中的数据可能已经被修改过,点击某条测试数据后将展示最新的数据。", "core.dataset.training.Agent queue": "QA 训练排队", - "core.dataset.training.Auto mode": "增强处理", + "core.dataset.training.Auto mode": "补充索引", "core.dataset.training.Auto mode Tip": "通过子索引以及调用模型生成相关问题与摘要,来增加数据块的语义丰富度,更利于检索。需要消耗更多的存储空间和增加 AI 调用次数。", - "core.dataset.training.Chunk mode": "直接分段", + "core.dataset.training.Chunk mode": "直接分块", "core.dataset.training.Full": "预计 5 分钟以上", "core.dataset.training.Leisure": "空闲", - "core.dataset.training.QA mode": "问答拆分", + "core.dataset.training.QA mode": "问答对提取", "core.dataset.training.Vector queue": "索引排队", "core.dataset.training.Waiting": "预计 5 分钟", "core.dataset.training.Website Sync": "Web 站点同步", @@ -864,7 +859,6 @@ "dataset.collections.Select Collection": "选择文件", "dataset.collections.Select One Collection To Store": "选择一个文件进行存储", "dataset.data.Can not edit": "无编辑权限", - "dataset.data.Custom Index Number": "自定义索引{{number}}", "dataset.data.Default Index": "默认索引", "dataset.data.Delete Tip": "确认删除该条数据?", "dataset.data.Index Placeholder": "输入索引文本内容", @@ -892,6 +886,9 @@ "error.upload_image_error": "上传文件失败", "error.username_empty": "账号不能为空", "error_collection_not_exist": "集合不存在", + "error_embedding_not_config": "未配置索引模型", + "error_llm_not_config": "未配置文件理解模型", + "error_vlm_not_config": "未配置图片理解模型", "extraction_results": "提取结果", "field_name": "字段名", "free": "免费", @@ -959,6 +956,7 @@ "new_create": "新建", "no": "否", "no_laf_env": "系统未配置Laf环境", + "not_model_config": 
"未配置相关模型", "not_yet_introduced": "暂无介绍", "option": "选项", "pay.amount": "金额", @@ -1124,7 +1122,6 @@ "support.wallet.invoice_detail": "发票详情", "support.wallet.invoice_info": "发票将在 3-7 个工作日内发送至邮箱,请耐心等待", "support.wallet.invoicing": "开票", - "support.wallet.moduleName.index": "索引生成", "support.wallet.moduleName.qa": "QA 拆分", "support.wallet.noBill": "无账单记录~", "support.wallet.no_invoice": "暂无开票记录", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index 682d2ccdd..2dc1cc54c 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -3,11 +3,16 @@ "add_file": "添加文件", "api_file": "API 文件库", "api_url": "接口地址", + "auto_indexes": "自动生成补充索引", + "auto_indexes_tips": "通过大模型进行额外索引生成,提高语义丰富度,提高检索的精度。", "chunk_max_tokens": "分块上限", "close_auto_sync": "确认关闭自动同步功能?", "collection.Create update time": "创建/更新时间", "collection.Training type": "训练模式", + "collection.training_type": "处理模式", "collection_data_count": "数据量", + "collection_metadata_custom_pdf_parse": "PDF增强解析", + "collection_metadata_image_parse": "图片标注", "collection_not_support_retraining": "该集合类型不支持重新调整参数", "collection_not_support_sync": "该集合不支持同步", "collection_sync": "立即同步", @@ -22,12 +27,21 @@ "custom_data_process_params_desc": "自定义设置数据处理规则", "data.ideal_chunk_length": "理想分块长度", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", + "data_index_custom": "自定义索引", + "data_index_default": "默认索引", + "data_index_image": "图片索引", + "data_index_num": "索引 {{index}}", + "data_index_question": "推测问题索引", + "data_index_summary": "摘要索引", "data_process_params": "处理参数", "data_process_setting": "数据处理配置", "dataset.Unsupported operation": "操作不支持", "dataset.no_collections": "暂无数据集", "dataset.no_tags": "暂无标签", + "default_params": "默认", + "default_params_desc": "使用系统默认的参数和规则", "edit_dataset_config": "编辑知识库配置", + "enhanced_indexes": "索引增强", "error.collectionNotFound": "集合找不到了~", "external_file": "外部文件库", "external_file_dataset_desc": "可以从外部文件库导入文件构建知识库,文件不会进行二次存储", @@ 
-38,19 +52,38 @@ "feishu_dataset": "飞书知识库", "feishu_dataset_config": "配置飞书知识库", "feishu_dataset_desc": "可通过配置飞书文档权限,使用飞书文档构建知识库,文档不会进行二次存储", + "file_list": "文件列表", "file_model_function_tip": "用于增强索引和 QA 生成", "filename": "文件名", "folder_dataset": "文件夹", "ideal_chunk_length": "理想分块长度", "ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。", + "image_auto_parse": "图片自动索引", + "image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引", "import.Auto mode Estimated Price Tips": "需调用文本理解模型,需要消耗较多AI 积分:{{price}} 积分/1K tokens", "import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 积分:{{price}} 积分/1K tokens", + "import_confirm": "确认上传", + "import_data_preview": "数据预览", + "import_data_process_setting": "数据处理方式设置", + "import_file_parse_setting": "文件解析设置", + "import_model_config": "模型选择", + "import_param_setting": "参数设置", + "import_select_file": "选择文件", "is_open_schedule": "启用定时同步", + "keep_image": "保留图片", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", "open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。", + "params_setting": "参数设置", + "pdf_enhance_parse": "PDF增强解析", + "pdf_enhance_parse_price": "{{price}}积分/页", + "pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。", "permission.des.manage": "可管理整个知识库数据和信息", "permission.des.read": "可查看知识库内容", "permission.des.write": "可增加和变更知识库内容", + "preview_chunk": "分块预览", + "preview_chunk_empty": "无法读取该文件内容", + "preview_chunk_intro": "最多展示 10 个分块", + "preview_chunk_not_selected": "点击左侧文件后进行预览", "rebuild_embedding_start_tip": "切换索引模型任务已开始", "rebuilding_index_count": "重建中索引数量:{{count}}", "request_headers": "请求头参数,会自动补充 Bearer", @@ -72,8 +105,10 @@ "tag.tags": "标签", "tag.total_tags": "共{{total}}个标签", "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "知识库有训练中或正在重建的索引", + "total_num_files": "共 {{total}} 个文件", "training_mode": "处理方式", "vector_model_max_tokens_tip": "每个分块数据,最大长度为 3000 tokens", + "vllm_model": "图片理解模型", "website_dataset": "Web 
站点同步", "website_dataset_desc": "Web 站点同步允许你直接使用一个网页链接构建知识库", "yuque_dataset": "语雀知识库", diff --git a/packages/web/i18n/zh-Hant/account_model.json b/packages/web/i18n/zh-Hant/account_model.json index 20b56ca6a..cf3803b37 100644 --- a/packages/web/i18n/zh-Hant/account_model.json +++ b/packages/web/i18n/zh-Hant/account_model.json @@ -19,6 +19,7 @@ "edit_channel": "渠道配置", "enable_channel": "啟用", "forbid_channel": "禁用", + "input maxToken_tip": "模型 max_tokens 參數,如果留空,則代表模型不支持該參數。", "key_type": "API key 格式:", "log": "調用日誌", "log_detail": "日誌詳情", @@ -26,6 +27,7 @@ "log_status": "狀態", "mapping": "模型映射", "mapping_tip": "需填寫一個有效 Json。\n可在向實際地址發送請求時,對模型進行映射。\n例如:\n{\n \n \"gpt-4o\": \"gpt-4o-test\"\n\n}\n\n當 FastGPT 請求 gpt-4o 模型時,會向實際地址發送 gpt-4o-test 的模型,而不是 gpt-4o。", + "max_temperature_tip": "模型 temperature 參數,不填則代表模型不支持 temperature 參數。", "model": "模型", "model_name": "模型名", "model_test": "模型測試", @@ -41,5 +43,7 @@ "selected_model_empty": "至少選擇一個模型", "start_test": "開始測試{{num}}個模型", "test_failed": "有{{num}}個模型報錯", + "vlm_model": "圖片理解模型", + "vlm_model_tip": "用於知識庫中對文檔中的圖片進行額外的索引生成", "waiting_test": "等待測試" } diff --git a/packages/web/i18n/zh-Hant/account_usage.json b/packages/web/i18n/zh-Hant/account_usage.json index dd6699c6d..4c8e276c8 100644 --- a/packages/web/i18n/zh-Hant/account_usage.json +++ b/packages/web/i18n/zh-Hant/account_usage.json @@ -2,6 +2,7 @@ "ai_model": "AI 模型", "all": "所有", "app_name": "應用程式名", + "auto_index": "索引增強", "billing_module": "扣費模組", "confirm_export": "共篩選出 {{total}} 條數據,是否確認導出?", "current_filter_conditions": "當前篩選條件:", @@ -9,6 +10,7 @@ "details": "詳情", "dingtalk": "釘釘", "duration_seconds": "時長(秒)", + "embedding_index": "索引生成", "every_day": "天", "every_month": "月", "export_confirm": "導出確認", @@ -16,6 +18,7 @@ "export_title": "時間,成員,類型,項目名,AI 積分消耗", "feishu": "飛書", "generation_time": "生成時間", + "image_parse": "圖片標註", "input_token_length": "輸入 tokens", "member": "成員", "member_name": "成員名", @@ -25,8 +28,12 @@ "official_account": "公眾號", "order_number": 
"訂單編號", "output_token_length": "輸出 tokens", + "pages": "頁數", + "pdf_enhanced_parse": "PDF 增強解析", + "pdf_parse": "PDF 解析", "points": "積分", "project_name": "專案名", + "qa": "問答對提取", "select_member_and_source_first": "請先選取成員和類型", "share": "分享連結", "source": "來源", diff --git a/packages/web/i18n/zh-Hant/app.json b/packages/web/i18n/zh-Hant/app.json index 8173ff642..dfa6bfc7d 100644 --- a/packages/web/i18n/zh-Hant/app.json +++ b/packages/web/i18n/zh-Hant/app.json @@ -105,6 +105,9 @@ "open_vision_function_tip": "有圖示開關的模型即擁有圖片辨識功能。若開啟,模型會解析檔案連結中的圖片,並自動解析使用者問題中的圖片(使用者問題 ≤ 500 字時生效)。", "or_drag_JSON": "或拖曳 JSON 檔案", "paste_config_or_drag": "貼上配置或拖入 JSON 文件", + "pdf_enhance_parse": "PDF增強解析", + "pdf_enhance_parse_price": "{{price}}積分/頁", + "pdf_enhance_parse_tips": "調用 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文檔中的圖片,同時也可以對掃描件進行識別,識別時間較長。", "permission.des.manage": "在寫入權限基礎上,可以設定發布通道、檢視對話紀錄、分配這個應用程式的權限", "permission.des.read": "可以使用這個應用程式進行對話", "permission.des.write": "可以檢視和編輯應用程式", diff --git a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json index 7d14dbf38..e3fd6e908 100644 --- a/packages/web/i18n/zh-Hant/common.json +++ b/packages/web/i18n/zh-Hant/common.json @@ -561,10 +561,7 @@ "core.dataset.file": "檔案", "core.dataset.folder": "目錄", "core.dataset.import.Auto mode Estimated Price Tips": "需要呼叫檔案處理模型,將消耗較多 AI 點數:{{price}} 點數/1K tokens", - "core.dataset.import.Auto process": "自動", - "core.dataset.import.Auto process desc": "自動設定分割和預處理規則", "core.dataset.import.Chunk Range": "範圍:{{min}}~{{max}}", - "core.dataset.import.Chunk Split": "直接分段", "core.dataset.import.Chunk Split Tip": "將文字依照特定規則進行分段處理後,轉換成可進行語意搜尋的格式,適合大多數場景。不需要呼叫模型額外處理,成本較低。", "core.dataset.import.Continue upload": "繼續上傳", "core.dataset.import.Custom process": "自訂規則", @@ -574,7 +571,6 @@ "core.dataset.import.Custom split char Tips": "允許您根據自訂的分隔符進行分割。通常用於已處理好的資料,使用特定的分隔符來精確分割。", "core.dataset.import.Custom text": "自訂文字", "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集", - 
"core.dataset.import.Data Preprocessing": "資料處理", "core.dataset.import.Data process params": "資料處理參數", "core.dataset.import.Down load csv template": "點選下載 CSV 範本", "core.dataset.import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數/1K tokens", @@ -596,7 +592,6 @@ "core.dataset.import.Source name": "來源名稱", "core.dataset.import.Sources list": "來源列表", "core.dataset.import.Start upload": "開始上傳", - "core.dataset.import.Total files": "共 {{total}} 個檔案", "core.dataset.import.Upload complete": "上傳完成", "core.dataset.import.Upload data": "確認上傳", "core.dataset.import.Upload file progress": "檔案上傳進度", @@ -646,12 +641,12 @@ "core.dataset.test.test result placeholder": "測試結果將顯示在這裡", "core.dataset.test.test result tip": "根據知識庫內容與測試文字的相似度進行排序。您可以根據測試結果調整相應的文字。\n注意:測試記錄中的資料可能已經被修改。點選某筆測試資料後將顯示最新資料。", "core.dataset.training.Agent queue": "問答訓練排隊中", - "core.dataset.training.Auto mode": "增強處理", + "core.dataset.training.Auto mode": "補充索引", "core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。", - "core.dataset.training.Chunk mode": "直接分段", + "core.dataset.training.Chunk mode": "直接分塊", "core.dataset.training.Full": "預計超過 5 分鐘", "core.dataset.training.Leisure": "閒置", - "core.dataset.training.QA mode": "問答拆分", + "core.dataset.training.QA mode": "問答對提取", "core.dataset.training.Vector queue": "索引排隊中", "core.dataset.training.Waiting": "預計 5 分鐘", "core.dataset.training.Website Sync": "網站同步", @@ -861,7 +856,6 @@ "dataset.collections.Select Collection": "選擇檔案", "dataset.collections.Select One Collection To Store": "選擇一個檔案進行儲存", "dataset.data.Can not edit": "無編輯權限", - "dataset.data.Custom Index Number": "自訂索引 {{number}}", "dataset.data.Default Index": "預設索引", "dataset.data.Delete Tip": "確認刪除此資料?", "dataset.data.Index Placeholder": "輸入索引文字內容", @@ -889,6 +883,9 @@ "error.upload_image_error": "上傳文件失敗", "error.username_empty": "帳號不能為空", "error_collection_not_exist": "集合不存在", + "error_embedding_not_config": "未配置索引模型", +
"error_llm_not_config": "未配置文件理解模型", + "error_vlm_not_config": "未配置圖片理解模型", "extraction_results": "提取結果", "field_name": "欄位名稱", "free": "免費", @@ -955,6 +952,7 @@ "new_create": "建立新項目", "no": "否", "no_laf_env": "系統未設定 LAF 環境", + "not_model_config": "未配置相關模型", "not_yet_introduced": "暫無介紹", "option": "選項", "pay.amount": "金額", @@ -1120,7 +1118,6 @@ "support.wallet.invoice_detail": "發票詳細資訊", "support.wallet.invoice_info": "發票將在 3-7 個工作天內寄送至電子郵件信箱,請耐心等候", "support.wallet.invoicing": "開立發票", - "support.wallet.moduleName.index": "產生索引", "support.wallet.moduleName.qa": "問答拆分", "support.wallet.noBill": "無帳單紀錄", "support.wallet.no_invoice": "無發票紀錄", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index e44d20974..7eab0f080 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -3,11 +3,16 @@ "add_file": "新增文件", "api_file": "API 檔案庫", "api_url": "介面位址", + "auto_indexes": "自動生成補充索引", + "auto_indexes_tips": "通過大模型進行額外索引生成,提高語義豐富度,提高檢索的精度。", "chunk_max_tokens": "分塊上限", "close_auto_sync": "確認關閉自動同步功能?", "collection.Create update time": "建立/更新時間", "collection.Training type": "分段模式", + "collection.training_type": "處理模式", "collection_data_count": "數據量", + "collection_metadata_custom_pdf_parse": "PDF增強解析", + "collection_metadata_image_parse": "圖片標註", "collection_not_support_retraining": "此集合類型不支援重新調整參數", "collection_not_support_sync": "該集合不支援同步", "collection_sync": "立即同步", @@ -22,12 +27,21 @@ "custom_data_process_params_desc": "自訂資料處理規則", "data.ideal_chunk_length": "理想分塊長度", "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引", + "data_index_custom": "自定義索引", + "data_index_default": "默認索引", + "data_index_image": "圖片索引", + "data_index_num": "索引 {{index}}", + "data_index_question": "推測問題索引", + "data_index_summary": "摘要索引", "data_process_params": "處理參數", "data_process_setting": "資料處理設定", "dataset.Unsupported operation": "操作不支持", "dataset.no_collections": "尚無資料集", "dataset.no_tags": "尚無標籤", + 
"default_params": "預設", + "default_params_desc": "使用系統默認的參數和規則", "edit_dataset_config": "編輯知識庫配置", + "enhanced_indexes": "索引增強", "error.collectionNotFound": "找不到集合", "external_file": "外部檔案庫", "external_file_dataset_desc": "可以從外部檔案庫匯入檔案建立資料集,檔案不會進行二次儲存", @@ -38,19 +52,38 @@ "feishu_dataset": "飛書知識庫", "feishu_dataset_config": "配置飛書知識庫", "feishu_dataset_desc": "可通過配置飛書文檔權限,使用飛書文檔構建知識庫,文檔不會進行二次存儲", + "file_list": "文件列表", "file_model_function_tip": "用於增強索引和問答生成", "filename": "檔案名稱", "folder_dataset": "資料夾", "ideal_chunk_length": "理想分塊長度", "ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。", + "image_auto_parse": "圖片自動索引", + "image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引", "import.Auto mode Estimated Price Tips": "需呼叫文字理解模型,將消耗較多 AI 點數:{{price}} 點數 / 1K tokens", "import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數 / 1K tokens", + "import_confirm": "確認上傳", + "import_data_preview": "數據預覽", + "import_data_process_setting": "數據處理方式設置", + "import_file_parse_setting": "文件解析設置", + "import_model_config": "模型選擇", + "import_param_setting": "參數設置", + "import_select_file": "選擇文件", "is_open_schedule": "啟用定時同步", + "keep_image": "保留圖片", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", "open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。", + "params_setting": "參數設置", + "pdf_enhance_parse": "PDF增強解析", + "pdf_enhance_parse_price": "{{price}}積分/頁", + "pdf_enhance_parse_tips": "調用 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文檔中的圖片,同時也可以對掃描件進行識別,識別時間較長。", "permission.des.manage": "可管理整個資料集的資料和資訊", "permission.des.read": "可檢視資料集內容", "permission.des.write": "可新增和變更資料集內容", + "preview_chunk": "分塊預覽", + "preview_chunk_empty": "無法讀取該文件內容", + "preview_chunk_intro": "最多展示 10 個分塊", + "preview_chunk_not_selected": "點擊左側文件後進行預覽", "rebuild_embedding_start_tip": "切換索引模型任務已開始", "rebuilding_index_count": "重建中索引數量:{{count}}", "request_headers": "請求頭", @@ -72,8 +105,10 @@ "tag.tags": "標籤", "tag.total_tags": "共 {{total}} 個標籤", 
"the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "資料集有索引正在訓練或重建中", + "total_num_files": "共 {{total}} 個文件", "training_mode": "分段模式", "vector_model_max_tokens_tip": "每個分塊數據,最大長度為 3000 tokens", + "vllm_model": "圖片理解模型", "website_dataset": "網站同步", "website_dataset_desc": "網站同步功能讓您可以直接使用網頁連結建立資料集", "yuque_dataset": "語雀知識庫", diff --git a/plugins/model/pdf-marker/Readme.md b/plugins/model/pdf-marker/Readme.md index b296c12d3..0a617335b 100644 --- a/plugins/model/pdf-marker/Readme.md +++ b/plugins/model/pdf-marker/Readme.md @@ -70,7 +70,7 @@ export PROCESSES_PER_GPU="1" python api_mp.py ``` -# 镜像打包和部署 +# 镜像打包和部署(推荐) ## 本地构建镜像 @@ -83,26 +83,39 @@ export PROCESSES_PER_GPU="1" ```bash sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 -e PROCESSES_PER_GPU="2" model_pdf ``` -## 快速构建镜像 + +## 快速构建镜像(推荐) + ```dockerfile -docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest -docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 -e PROCESSES_PER_GPU="2" crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest +docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.2 +docker run --gpus all -itd -p 7231:7232 --name model_pdf_v2 -e PROCESSES_PER_GPU="2" crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:v0.2 ``` -*注意*:参数PROCESSES_PER_GPU设置每张显卡上文件处理的并行数量,24G的显卡可以设置为2。在多显卡的环境中会自动切换显卡来运行多文件的并行处理。 + # 访问示例 -用Post方法访问端口为 `7321 ` 的 `v1/parse/file` 服务 +marker v0.1:用Post方法访问端口为 `7231` 的 `v1/parse/file` 服务 + +marker v0.2:用Post方法访问端口为 `7231` 的 `v2/parse/file` 服务 + -参数:file-->本地文件的地址 - 访问方法 - - ``` - curl --location --request POST "http://localhost:7231/v1/parse/file" \ - --header "Authorization: Bearer your_access_token" \ - --form "file=@./file/chinese_test.pdf" - ``` - + + - v0.2 + ``` + curl --location --request POST "http://localhost:7231/v2/parse/file" \ + --header "Authorization: Bearer
your_access_token" \ + --form "file=@./file/chinese_test.pdf" + ``` + - v0.1 + ``` + curl --location --request POST "http://localhost:7231/v1/parse/file" \ + --header "Authorization: Bearer your_access_token" \ + --form "file=@./file/chinese_test.pdf" + ``` + + 参数:file-->本地文件的地址 + - 多文件测试数据 运行 `test` 文件下的 `test.py` 文件,修改里面的 `file_paths` 为自己仓库的 `url` 即可 diff --git a/projects/app/data/config.json b/projects/app/data/config.json index c5bf0c5e1..0b9e7a598 100644 --- a/projects/app/data/config.json +++ b/projects/app/data/config.json @@ -4,9 +4,16 @@ "lafEnv": "https://laf.dev" // laf环境。 https://laf.run (杭州阿里云) ,或者私有化的laf环境。如果使用 Laf openapi 功能,需要最新版的 laf 。 }, "systemEnv": { - "vectorMaxProcess": 15, // 向量处理线程数量 - "qaMaxProcess": 15, // 问答拆分线程数量 + "vectorMaxProcess": 10, // 向量处理线程数量 + "qaMaxProcess": 10, // 问答拆分线程数量 + "vlmMaxProcess": 10, // 图片理解模型最大处理进程 "tokenWorkers": 30, // Token 计算线程保持数,会持续占用内存,不能设置太大。 - "pgHNSWEfSearch": 100 // 向量搜索参数。越大,搜索越精确,但是速度越慢。设置为100,有99%+精度。 + "pgHNSWEfSearch": 100, // 向量搜索参数。越大,搜索越精确,但是速度越慢。设置为100,有99%+精度。 + "customPdfParse": { + "url": "", // 自定义 PDF 解析服务地址 + "key": "", // 自定义 PDF 解析服务密钥 + "doc2xKey": "", // doc2x 服务密钥 + "price": 0 // PDF 解析服务价格 + } } } diff --git a/projects/app/package.json b/projects/app/package.json index 5f088e1d3..3762a2bae 100644 --- a/projects/app/package.json +++ b/projects/app/package.json @@ -1,6 +1,6 @@ { "name": "app", - "version": "4.8.23", + "version": "4.9.0", "private": false, "scripts": { "dev": "next dev", diff --git a/projects/app/public/docs/versionIntro.md b/projects/app/public/docs/versionIntro.md index c7b1571fc..c7aeb68e7 100644 --- a/projects/app/public/docs/versionIntro.md +++ b/projects/app/public/docs/versionIntro.md @@ -1,13 +1,21 @@ -### FastGPT V4.8.20 更新说明 +### FastGPT V4.9.0 更新说明 -1. 新增 - 使用记录导出和仪表盘。 -2. 新增 - DeepSeek resoner 模型支持输出思考过程。 -3. 新增 - markdown 语法扩展,支持音视频(代码块 audio 和 video)。 -4. 新增 - 飞书/语雀知识库。 -5. 新增 - 工作流知识库检索支持按知识库权限进行过滤。 -6. 新增 - 流程等待插件,可以等待 n 毫秒后继续执行流程。 -7. 
新增 - 飞书机器人接入,支持配置私有化飞书地址。 -8. 新增 - 支持通过 JSON 配置直接创建应用。 -9. 新增 - 支持通过 CURL 脚本快速创建 HTTP 插件。 -10. 新增 - 支持部门架构权限模式。 +#### 弃用 & 兼容 + +1. 弃用 - 之前私有化部署的自定义文件解析方案,请同步更新到最新的配置方案。[点击查看 PDF 增强解析配置](/docs/development/configuration/#使用-doc2x-解析-pdf-文件) +2. 弃用 - 弃用旧版本地文件上传 API:/api/core/dataset/collection/create/file(以前仅商业版可用的 API,该接口已切换成:/api/core/dataset/collection/create/localFile) +3. 停止维护,即将弃用 - 外部文件库相关 API,可通过 API 文件库替代。 +4. API更新 - 上传文件至知识库、创建链接集合、API 文件库、推送分块数据等带有 `trainingType` 字段的接口,`trainingType`字段未来仅支持`chunk`和`QA`两种模式。增强索引模式将设置单独字段:`autoIndexes`,目前仍有适配旧版`trainingType=auto`代码,但请尽快变更成新接口类型。具体可见:[知识库 OpenAPI 文档](/docs/development/openapi/dataset.md) + +#### 功能更新 + +1. 新增 - PDF 增强解析,可以识别图片、公式、扫描件,并将内容转化成 Markdown 格式。 +2. 新增 - 支持对文档中的图片链接,进行图片索引,提高图片内容的检索精度。 +3. 新增 - 语义检索增加迭代搜索,减少漏检。 +4. 优化 - 知识库数据不再限制索引数量,可无限自定义。同时可自动更新输入文本的索引,不影响自定义索引。 +5. 优化 - Markdown 解析,增加链接后中文标点符号检测,增加空格。 +6. 优化 - Prompt 模式工具调用,支持思考模型。同时优化其格式检测,减少空输出的概率。 +7. 优化 - 优化文件读取代码,极大提高大文件读取速度。50M PDF 读取速度提高 3 倍。 +8. 优化 - HTTP Body 适配,增加对字符串对象的适配。 +9.
修复 - 批量运行时,全局变量未进一步传递到下一次运行中,导致最终变量更新错误。 diff --git a/projects/app/src/components/Layout/index.tsx b/projects/app/src/components/Layout/index.tsx index 9d53d3da0..2ece77c27 100644 --- a/projects/app/src/components/Layout/index.tsx +++ b/projects/app/src/components/Layout/index.tsx @@ -51,13 +51,13 @@ export const navbarWidth = '64px'; const Layout = ({ children }: { children: JSX.Element }) => { const router = useRouter(); + const { toast } = useToast(); const { t } = useTranslation(); const { Loading } = useLoading(); const { loading, feConfigs, notSufficientModalType, llmModelList, embeddingModelList } = useSystemStore(); const { isPc } = useSystem(); - const { userInfo, teamPlanStatus, isUpdateNotification, setIsUpdateNotification } = - useUserStore(); + const { userInfo, isUpdateNotification, setIsUpdateNotification } = useUserStore(); const { setUserDefaultLng } = useI18nLng(); const isChatPage = useMemo( @@ -87,7 +87,6 @@ const Layout = ({ children }: { children: JSX.Element }) => { }); // Check model invalid - const { toast } = useToast(); useDebounceEffect( () => { if (userInfo?.username === 'root') { diff --git a/projects/app/src/components/Markdown/utils.ts b/projects/app/src/components/Markdown/utils.ts index d20f60848..6b4d1e5c5 100644 --- a/projects/app/src/components/Markdown/utils.ts +++ b/projects/app/src/components/Markdown/utils.ts @@ -14,32 +14,30 @@ export enum CodeClassNameEnum { export const mdTextFormat = (text: string) => { // NextChat function - Format latex to $$ - const escapeBrackets = (text: string) => { - const pattern = /(```[\s\S]*?```|`.*?`)|\\\[([\s\S]*?[^\\])\\\]|\\\((.*?)\\\)/g; - return text.replace(pattern, (match, codeBlock, squareBracket, roundBracket) => { - if (codeBlock) { - return codeBlock; - } else if (squareBracket) { - return `$$${squareBracket}$$`; - } else if (roundBracket) { - return `$${roundBracket}$`; - } - return match; - }); - }; - // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 
[675934a198f46329dfc6d05a](QUOTE) - const formatQuote = (text: string) => { - return ( - text - // .replace( - // /([\u4e00-\u9fa5\u3000-\u303f])([a-zA-Z0-9])|([a-zA-Z0-9])([\u4e00-\u9fa5\u3000-\u303f])/g, - // '$1$3 $2$4' - // ) - // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) - .replace(/\[quote:?\s*([a-f0-9]{24})\](?!\()/gi, '[$1](QUOTE)') - .replace(/\[([a-f0-9]{24})\](?!\()/g, '[$1](QUOTE)') - ); - }; + const pattern = /(```[\s\S]*?```|`.*?`)|\\\[([\s\S]*?[^\\])\\\]|\\\((.*?)\\\)/g; + text = text.replace(pattern, (match, codeBlock, squareBracket, roundBracket) => { + if (codeBlock) { + return codeBlock; + } else if (squareBracket) { + return `$$${squareBracket}$$`; + } else if (roundBracket) { + return `$${roundBracket}$`; + } + return match; + }); - return formatQuote(escapeBrackets(text)); + // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) + text = text + // .replace( + // /([\u4e00-\u9fa5\u3000-\u303f])([a-zA-Z0-9])|([a-zA-Z0-9])([\u4e00-\u9fa5\u3000-\u303f])/g, + // '$1$3 $2$4' + // ) + // 处理 [quote:id] 格式引用,将 [quote:675934a198f46329dfc6d05a] 转换为 [675934a198f46329dfc6d05a](QUOTE) + .replace(/\[quote:?\s*([a-f0-9]{24})\](?!\()/gi, '[$1](QUOTE)') + .replace(/\[([a-f0-9]{24})\](?!\()/g, '[$1](QUOTE)'); + + // 处理链接后的中文标点符号,增加空格 + text = text.replace(/(https?:\/\/[^\s,。!?;:、]+)([,。!?;:、])/g, '$1 $2'); + + return text; }; diff --git a/projects/app/src/components/Select/AIModelSelector.tsx b/projects/app/src/components/Select/AIModelSelector.tsx index 2a0881e9f..3a760e44a 100644 --- a/projects/app/src/components/Select/AIModelSelector.tsx +++ b/projects/app/src/components/Select/AIModelSelector.tsx @@ -35,19 +35,18 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => { return props.size ? 
size[props.size] : size['md']; }, [props.size]); - const avatarList = useMemo( - () => - list.map((item) => { - const modelData = getModelFromList( - [ - ...llmModelList, - ...embeddingModelList, - ...ttsModelList, - ...sttModelList, - ...reRankModelList - ], - item.value - ); + const avatarList = useMemo(() => { + const allModels = [ + ...llmModelList, + ...embeddingModelList, + ...ttsModelList, + ...sttModelList, + ...reRankModelList + ]; + return list + .map((item) => { + const modelData = getModelFromList(allModels, item.value)!; + if (!modelData) return; return { value: item.value, @@ -64,17 +63,20 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => { ) }; - }), - [ - list, - llmModelList, - embeddingModelList, - ttsModelList, - sttModelList, - reRankModelList, - avatarSize - ] - ); + }) + .filter(Boolean) as { + value: any; + label: React.JSX.Element; + }[]; + }, [ + list, + llmModelList, + embeddingModelList, + ttsModelList, + sttModelList, + reRankModelList, + avatarSize + ]); return ( { className="nowheel" isDisabled={!!disableTip} list={avatarList} + placeholder={t('common:not_model_config')} h={'40px'} {...props} onchange={(e) => { @@ -107,19 +110,21 @@ const OneRowSelector = ({ list, onchange, disableTip, ...props }: Props) => { ); }; -const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) => { +const MultipleRowSelector = ({ list, onchange, disableTip, placeholder, ...props }: Props) => { const { t } = useTranslation(); const { llmModelList, embeddingModelList, ttsModelList, sttModelList, reRankModelList } = useSystemStore(); const modelList = useMemo(() => { - return [ + const allModels = [ ...llmModelList, ...embeddingModelList, ...ttsModelList, ...sttModelList, ...reRankModelList ]; - }, [llmModelList, embeddingModelList, ttsModelList, sttModelList, reRankModelList]); + + return list.map((item) => getModelFromList(allModels, item.value)!).filter(Boolean); + }, [llmModelList, embeddingModelList, 
ttsModelList, sttModelList, reRankModelList, list]); const [value, setValue] = useState([]); @@ -157,6 +162,7 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) => for (const item of list) { const modelData = getModelFromList(modelList, item.value); + if (!modelData) continue; const provider = renderList.find((item) => item.value === (modelData?.provider || 'Other')) ?? renderList[renderList.length - 1]; @@ -168,7 +174,7 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) => } return renderList.filter((item) => item.children.length > 0); - }, [avatarSize, list, modelList]); + }, [avatarSize, list, modelList, t]); const onSelect = useCallback( (e: string[]) => { @@ -178,8 +184,11 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) => ); const SelectedModel = useMemo(() => { + if (!props.value) return <>{t('common:not_model_config')}; const modelData = getModelFromList(modelList, props.value); + if (!modelData) return <>{t('common:not_model_config')}; + setValue([modelData.provider, props.value]); return ( @@ -194,7 +203,7 @@ const MultipleRowSelector = ({ list, onchange, disableTip, ...props }: Props) => {modelData?.name} ); - }, [modelList, props.value, avatarSize]); + }, [modelList, props.value, t, avatarSize]); return ( list={selectorList} onSelect={onSelect} value={value} + placeholder={placeholder} rowMinWidth="160px" ButtonProps={{ isDisabled: !!disableTip, diff --git a/projects/app/src/components/core/app/FileSelect.tsx b/projects/app/src/components/core/app/FileSelect.tsx index 7f7361d9c..f1e3879bc 100644 --- a/projects/app/src/components/core/app/FileSelect.tsx +++ b/projects/app/src/components/core/app/FileSelect.tsx @@ -9,7 +9,8 @@ import { HStack, Switch, ModalFooter, - BoxProps + BoxProps, + Checkbox } from '@chakra-ui/react'; import React, { useMemo } from 'react'; import { useTranslation } from 'next-i18next'; @@ -22,6 +23,8 @@ import FormLabel from 
'@fastgpt/web/components/common/MyBox/FormLabel'; import { useMount } from 'ahooks'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; +import MyTag from '@fastgpt/web/components/common/Tag/index'; +import MyDivider from '@fastgpt/web/components/common/MyDivider'; const FileSelect = ({ forbidVision = false, @@ -95,6 +98,42 @@ const FileSelect = ({ }} /> + {value.canSelectFile && feConfigs.showCustomPdfParse && ( + <> + + { + onChange({ + ...value, + customPdfParse: e.target.checked + }); + }} + > + {t('app:pdf_enhance_parse')} + + + {feConfigs?.show_pay && ( + + {t('app:pdf_enhance_parse_price', { + price: feConfigs.customPdfParsePrice || 0 + })} + + )} + + + + )} {t('app:image_upload')} {forbidVision ? ( diff --git a/projects/app/src/global/core/dataset/api.d.ts b/projects/app/src/global/core/dataset/api.d.ts index 5716b3503..fc09332c6 100644 --- a/projects/app/src/global/core/dataset/api.d.ts +++ b/projects/app/src/global/core/dataset/api.d.ts @@ -26,6 +26,7 @@ export type CreateDatasetParams = { avatar: string; vectorModel?: string; agentModel?: string; + vlmModel?: string; apiServer?: APIFileServer; feishuServer?: FeishuServer; yuqueServer?: YuqueServer; diff --git a/projects/app/src/pageComponents/account/model/AddModelBox.tsx b/projects/app/src/pageComponents/account/model/AddModelBox.tsx index 9ae33164f..582bea51c 100644 --- a/projects/app/src/pageComponents/account/model/AddModelBox.tsx +++ b/projects/app/src/pageComponents/account/model/AddModelBox.tsx @@ -38,6 +38,7 @@ import { useSystemStore } from '@/web/common/system/useSystemStore'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; import { Prompt_CQJson, Prompt_ExtractJson } from '@fastgpt/global/core/ai/prompt/agent'; import MyModal from '@fastgpt/web/components/common/MyModal'; +import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; export const 
AddModelButton = ({ onCreate, @@ -134,6 +135,14 @@ export const ModelEditModal = ({ const { runAsync: updateModel, loading: updatingModel } = useRequest2( async (data: SystemModelItemType) => { + for (const key in data) { + // @ts-ignore + const val = data[key]; + if (val === null || val === undefined || Number.isNaN(val)) { + // @ts-ignore + data[key] = ''; + } + } return putSystemModel({ model: data.model, metadata: data @@ -295,7 +304,16 @@ export const ModelEditModal = ({ {isLLMModel && ( <> - + - + - + - + } {hasCharsLen && } {hasDuration && } + {hasPages && } @@ -126,6 +130,7 @@ const UsageDetail = ({ usage, onClose }: { usage: UsageItemType; onClose: () => {hasOutputToken && } {hasCharsLen && } {hasDuration && } + {hasPages && } ))} diff --git a/projects/app/src/pageComponents/account/usage/UsageTable.tsx b/projects/app/src/pageComponents/account/usage/UsageTable.tsx index ccb4bf0b9..45ae24140 100644 --- a/projects/app/src/pageComponents/account/usage/UsageTable.tsx +++ b/projects/app/src/pageComponents/account/usage/UsageTable.tsx @@ -87,8 +87,8 @@ const UsageTableList = ({ 'common:support.wallet.usage.Audio Speech' ), ['support.wallet.usage.Whisper']: t('common:support.wallet.usage.Whisper'), - ['support.wallet.moduleName.index']: t('common:support.wallet.moduleName.index'), - ['support.wallet.moduleName.qa']: t('common:support.wallet.moduleName.qa'), + ['account_usage:embedding_index']: t('account_usage:embedding_index'), + ['account_usage:qa']: t('account_usage:qa'), ['core.dataset.training.Auto mode']: t('common:core.dataset.training.Auto mode'), ['common:core.module.template.ai_chat']: t('common:core.module.template.ai_chat') }, @@ -122,49 +122,51 @@ const UsageTableList = ({ onConfirm={exportUsage} /> - - -
{t('common:core.ai.Max context')} + + {t('common:core.ai.Max context')} + +
{t('account:model.max_quote')} + + {t('account:model.max_quote')} + +
{t('common:core.chat.response.module maxToken')} + + {t('common:core.chat.response.module maxToken')} + + + @@ -329,7 +361,12 @@ export const ModelEditModal = ({
{t('account:model.max_temperature')} + + {t('account:model.max_temperature')} + + + { { const val = (() => { - if (!e) return 0; + if (!e) return 1; return e; })(); updateChannel({ diff --git a/projects/app/src/pageComponents/account/model/ModelConfigTable.tsx b/projects/app/src/pageComponents/account/model/ModelConfigTable.tsx index c2a59a829..7bf61393e 100644 --- a/projects/app/src/pageComponents/account/model/ModelConfigTable.tsx +++ b/projects/app/src/pageComponents/account/model/ModelConfigTable.tsx @@ -563,8 +563,10 @@ const DefaultModelModal = ({ embeddingModelList, ttsModelList, sttModelList, - reRankModelList + reRankModelList, + getVlmModelList } = useSystemStore(); + const vlmModelList = useMemo(() => getVlmModelList(), [getVlmModelList]); // Create a copy of defaultModels for local state management const [defaultData, setDefaultData] = useState(defaultModels); @@ -703,6 +705,28 @@ const DefaultModelModal = ({ /> + + + {t('account_model:vlm_model')} + + + + ({ + value: item.model, + label: item.name + }))} + onchange={(e) => { + setDefaultData((state) => ({ + ...state, + datasetImageLLM: vlmModelList.find((item) => item.model === e) + })); + }} + /> + + {t('account_usage:output_token_length')}{t('account_usage:text_length')}{t('account_usage:duration_seconds')}{t('account_usage:pages')}{t('account_usage:total_points_consumed')}
{item.outputTokens ?? '-'}{item.charsLength ?? '-'}{item.duration ?? '-'}{item.pages ?? '-'}{formatNumber(item.amount)}
- - - - - - - - - - - - {usages.map((item) => ( - - - - - - - + + + +
{t('common:user.Time')}{t('account_usage:member')}{t('account_usage:user_type')}{t('account_usage:project_name')}{t('account_usage:total_points')}
{dayjs(item.time).format('YYYY/MM/DD HH:mm:ss')} - - - {item.sourceMember.name} - - {t(UsageSourceMap[item.source]?.label as any) || '-'}{t(item.appName as any) || '-'}{formatNumber(item.totalPoints) || 0} - -
+ + + + + + + + - ))} - -
{t('common:user.Time')}{t('account_usage:member')}{t('account_usage:user_type')}{t('account_usage:project_name')}{t('account_usage:total_points')}
- {!isLoading && usages.length === 0 && ( - - )} - + + + {usages.map((item) => ( + + {dayjs(item.time).format('YYYY/MM/DD HH:mm:ss')} + + + + {item.sourceMember.name} + + + {t(UsageSourceMap[item.source]?.label as any) || '-'} + {t(item.appName as any) || '-'} + {formatNumber(item.totalPoints) || 0} + + + + + ))} + + + {!isLoading && usages.length === 0 && ( + + )} + + diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx index 1462ab563..7fc193f40 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx @@ -18,7 +18,7 @@ import { useQuery } from '@tanstack/react-query'; import { useTranslation } from 'next-i18next'; import MyIcon from '@fastgpt/web/components/common/Icon'; import MyInput from '@/components/MyInput'; -import { useRequest } from '@fastgpt/web/hooks/useRequest'; +import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { useRouter } from 'next/router'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import MyMenu from '@fastgpt/web/components/common/MyMenu'; @@ -28,7 +28,8 @@ import { TrainingModeEnum, DatasetTypeEnum, DatasetTypeMap, - DatasetStatusEnum + DatasetStatusEnum, + DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants'; import EditFolderModal, { useEditFolder } from '../../EditFolderModal'; import { TabEnum } from '../../../../pages/dataset/detail/index'; @@ -41,6 +42,7 @@ import { CollectionPageContext } from './Context'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { useSystem } from '@fastgpt/web/hooks/useSystem'; import HeaderTagPopOver from './HeaderTagPopOver'; +import MyBox from '@fastgpt/web/components/common/MyBox'; const FileSourceSelector = dynamic(() => 
import('../Import/components/FileSourceSelector')); @@ -48,7 +50,7 @@ const Header = ({}: {}) => { const { t } = useTranslation(); const theme = useTheme(); - const { setLoading, feConfigs } = useSystemStore(); + const { feConfigs } = useSystemStore(); const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const router = useRouter(); @@ -69,50 +71,36 @@ const Header = ({}: {}) => { tip: t('common:dataset.Manual collection Tip'), canEmpty: false }); + const { isOpen: isOpenFileSourceSelector, onOpen: onOpenFileSourceSelector, onClose: onCloseFileSourceSelector } = useDisclosure(); - const { mutate: onCreateCollection } = useRequest({ - mutationFn: async ({ - name, - type, - callback, - ...props - }: { - name: string; - type: DatasetCollectionTypeEnum; - callback?: (id: string) => void; - trainingType?: TrainingModeEnum; - rawLink?: string; - chunkSize?: number; - }) => { - setLoading(true); + + const { runAsync: onCreateCollection, loading: onCreating } = useRequest2( + async ({ name, type }: { name: string; type: DatasetCollectionTypeEnum }) => { const id = await postDatasetCollection({ parentId, datasetId: datasetDetail._id, name, - type, - ...props + type }); - callback?.(id); return id; }, - onSuccess() { - getData(pageNum); - }, - onSettled() { - setLoading(false); - }, + { + onSuccess() { + getData(pageNum); + }, + successToast: t('common:common.Create Success'), + errorToast: t('common:common.Create Failed') + } + ); - successToast: t('common:common.Create Success'), - errorToast: t('common:common.Create Failed') - }); const isWebSite = datasetDetail?.type === DatasetTypeEnum.websiteDataset; return ( - + { )} {isOpenFileSourceSelector && } - + ); }; diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx index ace71ab04..1c5c998b7 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx +++ 
b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx @@ -29,7 +29,8 @@ import { DatasetCollectionTypeEnum, DatasetStatusEnum, DatasetCollectionSyncResultMap, - DatasetTypeEnum + DatasetTypeEnum, + DatasetCollectionDataProcessModeMap } from '@fastgpt/global/core/dataset/constants'; import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; import { TabEnum } from '../../../../pages/dataset/detail/index'; @@ -44,10 +45,7 @@ import { CollectionPageContext } from './Context'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { formatTime2YMDHM } from '@fastgpt/global/common/string/time'; import MyTag from '@fastgpt/web/components/common/Tag/index'; -import { - checkCollectionIsFolder, - getTrainingTypeLabel -} from '@fastgpt/global/core/dataset/collection/utils'; +import { checkCollectionIsFolder } from '@fastgpt/global/core/dataset/collection/utils'; import { useFolderDrag } from '@/components/common/folder/useFolderDrag'; import TagsPopOver from './TagsPopOver'; import { useSystemStore } from '@/web/common/system/useSystemStore'; @@ -194,7 +192,7 @@ const CollectionCard = () => { {t('common:common.Name')} - {t('dataset:collection.Training type')} + {t('dataset:collection.training_type')} {t('dataset:collection_data_count')} {t('dataset:collection.Create update time')} {t('common:common.Status')} @@ -251,7 +249,14 @@ const CollectionCard = () => { {!checkCollectionIsFolder(collection.type) ? ( - <>{t((getTrainingTypeLabel(collection.trainingType) || '-') as any)} + <> + {collection.trainingType + ? 
t( + (DatasetCollectionDataProcessModeMap[collection.trainingType] + ?.label || '-') as any + ) + : '-'} + ) : ( '-' )} diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index 52eacd9bf..853efddec 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -1,13 +1,16 @@ import { useRouter } from 'next/router'; -import { SetStateAction, useState } from 'react'; +import { SetStateAction, useMemo, useState } from 'react'; import { useTranslation } from 'next-i18next'; import { createContext, useContextSelector } from 'use-context-selector'; -import { ImportDataSourceEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { + DatasetCollectionDataProcessModeEnum, + ImportDataSourceEnum +} from '@fastgpt/global/core/dataset/constants'; import { useMyStep } from '@fastgpt/web/hooks/useStep'; import { Box, Button, Flex, IconButton } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { TabEnum } from '../NavBar'; -import { ImportProcessWayEnum } from '@/web/core/dataset/constants'; +import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import { UseFormReturn, useForm } from 'react-hook-form'; import { ImportSourceItemType } from '@/web/core/dataset/type'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; @@ -19,12 +22,10 @@ type TrainingFiledType = { minChunkSize: number; autoChunkSize: number; chunkSize: number; - showChunkInput: boolean; - showPromptInput: boolean; charsPointsPrice: number; priceTip: string; uploadRate: number; - chunkSizeField?: ChunkSizeFieldType; + chunkSizeField: ChunkSizeFieldType; }; type DatasetImportContextType = { importSource: ImportDataSourceEnum; @@ -39,8 +40,13 @@ type DatasetImportContextType = { type ChunkSizeFieldType = 'embeddingChunkSize' | 
'qaChunkSize'; export type ImportFormType = { - mode: TrainingModeEnum; - way: ImportProcessWayEnum; + customPdfParse: boolean; + + trainingType: DatasetCollectionDataProcessModeEnum; + imageIndex: boolean; + autoIndexes: boolean; + + chunkSettingMode: ChunkSettingModeEnum; embeddingChunkSize: number; qaChunkSize: number; customSplitChar: string; @@ -58,8 +64,6 @@ export const DatasetImportContext = createContext({ maxChunkSize: 0, minChunkSize: 0, - showChunkInput: false, - showPromptInput: false, sources: [], setSources: function (value: SetStateAction): void { throw new Error('Function not implemented.'); @@ -88,72 +92,93 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode const modeSteps: Record = { [ImportDataSourceEnum.reTraining]: [ { title: t('dataset:core.dataset.import.Adjust parameters') }, - { title: t('common:core.dataset.import.Upload data') } + { + title: t('dataset:import_data_preview') + }, + { title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.fileLocal]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.fileLink]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.fileCustom]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: 
t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.csvTable]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.externalFile]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ], [ImportDataSourceEnum.apiDataset]: [ { - title: t('common:core.dataset.import.Select file') + title: t('dataset:import_select_file') }, { - title: t('common:core.dataset.import.Data Preprocessing') + title: t('dataset:import_param_setting') }, { - title: t('common:core.dataset.import.Upload data') + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') } ] }; @@ -168,96 +193,114 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode const processParamsForm = useForm({ defaultValues: { - mode: TrainingModeEnum.chunk, - way: ImportProcessWayEnum.auto, + imageIndex: false, + autoIndexes: false, + + trainingType: DatasetCollectionDataProcessModeEnum.chunk, + + chunkSettingMode: ChunkSettingModeEnum.auto, embeddingChunkSize: vectorModel?.defaultToken || 512, qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), customSplitChar: '', qaPrompt: Prompt_AgentQA.description, - webSelector: '' + webSelector: '', + customPdfParse: false } }); 
const [sources, setSources] = useState([]); // watch form - const mode = processParamsForm.watch('mode'); - const way = processParamsForm.watch('way'); + const trainingType = processParamsForm.watch('trainingType'); + const chunkSettingMode = processParamsForm.watch('chunkSettingMode'); const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize'); const qaChunkSize = processParamsForm.watch('qaChunkSize'); const customSplitChar = processParamsForm.watch('customSplitChar'); + const autoIndexes = processParamsForm.watch('autoIndexes'); - const modeStaticParams: Record = { - [TrainingModeEnum.auto]: { - chunkOverlapRatio: 0.2, - maxChunkSize: 2048, - minChunkSize: 100, - autoChunkSize: vectorModel?.defaultToken ? vectorModel?.defaultToken * 2 : 1024, - chunkSize: vectorModel?.defaultToken ? vectorModel?.defaultToken * 2 : 1024, - showChunkInput: false, - showPromptInput: false, - charsPointsPrice: agentModel.charsPointsPrice || 0, - priceTip: t('dataset:import.Auto mode Estimated Price Tips', { - price: agentModel.charsPointsPrice - }), - uploadRate: 100 - }, - [TrainingModeEnum.chunk]: { - chunkSizeField: 'embeddingChunkSize' as ChunkSizeFieldType, - chunkOverlapRatio: 0.2, - maxChunkSize: vectorModel?.maxToken || 512, - minChunkSize: 100, - autoChunkSize: vectorModel?.defaultToken || 512, - chunkSize: embeddingChunkSize, - showChunkInput: true, - showPromptInput: false, - charsPointsPrice: vectorModel.charsPointsPrice || 0, - priceTip: t('dataset:import.Embedding Estimated Price Tips', { - price: vectorModel.charsPointsPrice - }), - uploadRate: 150 - }, - [TrainingModeEnum.qa]: { - chunkSizeField: 'qaChunkSize' as ChunkSizeFieldType, - chunkOverlapRatio: 0, - maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7), - minChunkSize: 4000, - autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), - chunkSize: qaChunkSize, - showChunkInput: true, - showPromptInput: true, - charsPointsPrice: 
agentModel.charsPointsPrice || 0, - priceTip: t('dataset:import.Auto mode Estimated Price Tips', { - price: agentModel.charsPointsPrice - }), - uploadRate: 30 + const TrainingModeMap = useMemo(() => { + if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { + return { + chunkSizeField: 'qaChunkSize', + chunkOverlapRatio: 0, + maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7), + minChunkSize: 4000, + autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), + chunkSize: qaChunkSize, + charsPointsPrice: agentModel.charsPointsPrice || 0, + priceTip: t('dataset:import.Auto mode Estimated Price Tips', { + price: agentModel.charsPointsPrice + }), + uploadRate: 30 + }; + } else if (autoIndexes) { + return { + chunkSizeField: 'embeddingChunkSize', + chunkOverlapRatio: 0.2, + maxChunkSize: 2048, + minChunkSize: 100, + autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024, + chunkSize: embeddingChunkSize, + charsPointsPrice: agentModel.charsPointsPrice || 0, + priceTip: t('dataset:import.Auto mode Estimated Price Tips', { + price: agentModel.charsPointsPrice + }), + uploadRate: 100 + }; + } else { + return { + chunkSizeField: 'embeddingChunkSize', + chunkOverlapRatio: 0.2, + maxChunkSize: vectorModel?.maxToken || 512, + minChunkSize: 100, + autoChunkSize: vectorModel?.defaultToken || 512, + chunkSize: embeddingChunkSize, + charsPointsPrice: vectorModel.charsPointsPrice || 0, + priceTip: t('dataset:import.Embedding Estimated Price Tips', { + price: vectorModel.charsPointsPrice + }), + uploadRate: 150 + }; } - }; - const selectModelStaticParam = modeStaticParams[mode]; + }, [ + trainingType, + autoIndexes, + agentModel.maxResponse, + agentModel.maxContext, + agentModel.charsPointsPrice, + qaChunkSize, + t, + vectorModel.defaultToken, + vectorModel?.maxToken, + vectorModel.charsPointsPrice, + embeddingChunkSize + ]); - const wayStaticPrams = { - [ImportProcessWayEnum.auto]: { - chunkSize: 
selectModelStaticParam.autoChunkSize, - customSplitChar: '' - }, - [ImportProcessWayEnum.custom]: { - chunkSize: modeStaticParams[mode].chunkSize, - customSplitChar + const chunkSettingModeMap = useMemo(() => { + if (chunkSettingMode === ChunkSettingModeEnum.auto) { + return { + chunkSize: TrainingModeMap.autoChunkSize, + customSplitChar: '' + }; + } else { + return { + chunkSize: TrainingModeMap.chunkSize, + customSplitChar + }; } - }; - const chunkSize = wayStaticPrams[way].chunkSize; + }, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]); const contextValue = { + ...TrainingModeMap, + ...chunkSettingModeMap, importSource: source, parentId, activeStep, goToNext, processParamsForm, - ...selectModelStaticParam, sources, - setSources, - chunkSize + setSources }; return ( diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx index c5e30ed49..6daae5d73 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/DataProcess.tsx @@ -1,4 +1,4 @@ -import React, { useCallback, useMemo, useRef } from 'react'; +import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { Box, Flex, @@ -7,45 +7,48 @@ import { ModalBody, ModalFooter, Textarea, - useDisclosure + useDisclosure, + Checkbox, + Accordion, + AccordionItem, + AccordionButton, + AccordionPanel, + AccordionIcon, + HStack } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { useTranslation } from 'next-i18next'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; -import { TrainingModeEnum, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants'; -import { ImportProcessWayEnum } from '@/web/core/dataset/constants'; +import { + 
DatasetCollectionDataProcessModeEnum, + DatasetCollectionDataProcessModeMap +} from '@fastgpt/global/core/dataset/constants'; +import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import MyModal from '@fastgpt/web/components/common/MyModal'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; -import Preview from '../components/Preview'; import MyTag from '@fastgpt/web/components/common/Tag/index'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext } from '../Context'; -import { useToast } from '@fastgpt/web/hooks/useToast'; import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; import MyNumberInput from '@fastgpt/web/components/common/Input/NumberInput'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; +import { shadowLight } from '@fastgpt/web/styles/theme'; +import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import { useToast } from '@fastgpt/web/hooks/useToast'; -function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean }) { +function DataProcess() { const { t } = useTranslation(); const { feConfigs } = useSystemStore(); - - const { - goToNext, - processParamsForm, - chunkSizeField, - minChunkSize, - showChunkInput, - showPromptInput, - maxChunkSize, - priceTip, - chunkSize - } = useContextSelector(DatasetImportContext, (v) => v); - const { getValues, setValue, register, watch } = processParamsForm; const { toast } = useToast(); - const mode = watch('mode'); - const way = watch('way'); + + const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } = + useContextSelector(DatasetImportContext, (v) => v); + const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); + const { getValues, setValue, register, 
watch } = processParamsForm; + const trainingType = watch('trainingType'); + const chunkSettingMode = watch('chunkSettingMode'); const { isOpen: isOpenCustomPrompt, @@ -54,214 +57,317 @@ function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean } = useDisclosure(); const trainingModeList = useMemo(() => { - const list = Object.entries(TrainingTypeMap); - return list; + const list = Object.entries(DatasetCollectionDataProcessModeMap); + return list + .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto) + .map(([key, value]) => ({ + title: t(value.label as any), + value: key as DatasetCollectionDataProcessModeEnum, + tooltip: t(value.tooltip as any) + })); }, []); - const onSelectTrainWay = useCallback( - (e: TrainingModeEnum) => { - if (!feConfigs?.isPlus && !TrainingTypeMap[e]?.openSource) { - return toast({ - status: 'warning', - title: t('common:common.system.Commercial version function') - }); - } - setValue('mode', e); - }, - [feConfigs?.isPlus, setValue, t, toast] - ); + const Title = useCallback(({ title }: { title: string }) => { + return ( + + + + {title} + + + + ); + }, []); + + // Adapt auto training + useEffect(() => { + if (trainingType === DatasetCollectionDataProcessModeEnum.auto) { + setValue('autoIndexes', true); + setValue('trainingType', DatasetCollectionDataProcessModeEnum.chunk); + } + }, [trainingType, setValue]); + + const showFileParseSetting = feConfigs?.showCustomPdfParse; + const showQAPromptInput = trainingType === DatasetCollectionDataProcessModeEnum.qa; return ( - - - - - {t('dataset:data_process_setting')} - + <> + + + {showFileParseSetting && ( + + - <Box display={['block', 'flex']} mt={4} alignItems={'center'}> - <FormLabel flex={'0 0 100px'}>{t('dataset:training_mode')}</FormLabel> - <LeftRadio - list={trainingModeList.map(([key, value]) => ({ - title: t(value.label as any), - value: key, - tooltip: t(value.tooltip as any) - }))} - px={3} - py={2} - value={mode} - onChange={onSelectTrainWay} - 
defaultBg="white" - activeBg="white" - display={'flex'} - flexWrap={'wrap'} - /> - </Box> - - <Box display={['block', 'flex']} mt={5}> - <FormLabel flex={'0 0 100px'}>{t('dataset:data_process_params')}</FormLabel> - <LeftRadio - list={[ - { - title: t('common:core.dataset.import.Auto process'), - desc: t('common:core.dataset.import.Auto process desc'), - value: ImportProcessWayEnum.auto - }, - { - title: t('dataset:custom_data_process_params'), - desc: t('dataset:custom_data_process_params_desc'), - value: ImportProcessWayEnum.custom, - children: way === ImportProcessWayEnum.custom && ( - <Box mt={5}> - {showChunkInput && chunkSizeField && ( - <Box> - <Flex alignItems={'center'}> - <Box>{t('dataset:ideal_chunk_length')}</Box> - <QuestionTip label={t('dataset:ideal_chunk_length_tips')} /> - </Flex> - <Box - mt={1} - css={{ - '& > span': { - display: 'block' - } - }} - > - <MyTooltip - label={t('common:core.dataset.import.Chunk Range', { - min: minChunkSize, - max: maxChunkSize - })} - > - <MyNumberInput - name={chunkSizeField} - min={minChunkSize} - max={maxChunkSize} - size={'sm'} - step={100} - value={chunkSize} - onChange={(e) => { - if (e === undefined) return; - setValue(chunkSizeField, +e); - }} - /> - </MyTooltip> - </Box> - </Box> - )} - - <Box mt={3}> - <Box> - {t('common:core.dataset.import.Custom split char')} - <QuestionTip - label={t('common:core.dataset.import.Custom split char Tips')} - /> - </Box> - <Box mt={1}> - <Input - size={'sm'} - bg={'myGray.50'} - defaultValue={''} - placeholder="\n;======;==SPLIT==" - {...register('customSplitChar')} - /> - </Box> - </Box> - - {showPromptInput && ( - <Box mt={3}> - <Box>{t('common:core.dataset.collection.QA Prompt')}</Box> - <Box - position={'relative'} - py={2} - px={3} - bg={'myGray.50'} - fontSize={'xs'} - whiteSpace={'pre-wrap'} - border={'1px'} - borderColor={'borderColor.base'} + <AccordionPanel p={2}> + <Flex + flexDirection={'column'} + gap={3} + border={'1px solid'} + borderColor={'primary.600'} + 
borderRadius={'md'} + boxShadow={shadowLight} + p={4} + > + {feConfigs.showCustomPdfParse && ( + <HStack spacing={1}> + <Checkbox {...register('customPdfParse')}> + <FormLabel>{t('dataset:pdf_enhance_parse')}</FormLabel> + </Checkbox> + <QuestionTip label={t('dataset:pdf_enhance_parse_tips')} /> + {feConfigs?.show_pay && ( + <MyTag + type={'borderSolid'} + borderColor={'myGray.200'} + bg={'myGray.100'} + color={'primary.600'} + py={1.5} borderRadius={'md'} - maxH={'140px'} - overflow={'auto'} - _hover={{ - '& .mask': { - display: 'block' - } - }} + px={3} + whiteSpace={'wrap'} + ml={1} > - {getValues('qaPrompt')} + {t('dataset:pdf_enhance_parse_price', { + price: feConfigs.customPdfParsePrice || 0 + })} + </MyTag> + )} + </HStack> + )} + </Flex> + </AccordionPanel> + </AccordionItem> + )} - <Box - display={'none'} - className="mask" - position={'absolute'} - top={0} - right={0} - bottom={0} - left={0} - background={ - 'linear-gradient(182deg, rgba(255, 255, 255, 0.00) 1.76%, #FFF 84.07%)' - } - > - <Button - size="xs" - variant={'whiteBase'} - leftIcon={<MyIcon name={'edit'} w={'13px'} />} - color={'black'} - position={'absolute'} - right={2} - bottom={2} - onClick={onOpenCustomPrompt} - > - {t('common:core.dataset.import.Custom prompt')} - </Button> - </Box> - </Box> - </Box> - )} + <AccordionItem mt={4} border={'none'}> + <Title title={t('dataset:import_data_process_setting')} /> + + <AccordionPanel p={2}> + <Box mt={2}> + <Box fontSize={'sm'} mb={2} color={'myGray.600'}> + {t('dataset:training_mode')} + </Box> + <LeftRadio<DatasetCollectionDataProcessModeEnum> + list={trainingModeList} + px={3} + py={2.5} + value={trainingType} + onChange={(e) => { + setValue('trainingType', e); + }} + defaultBg="white" + activeBg="white" + gridTemplateColumns={'repeat(2, 1fr)'} + /> + </Box> + {trainingType === DatasetCollectionDataProcessModeEnum.chunk && feConfigs?.isPlus && ( + <Box mt={6}> + <Box fontSize={'sm'} mb={2} color={'myGray.600'}> + {t('dataset:enhanced_indexes')} 
</Box> - ) - } - ]} - px={3} - py={3} - defaultBg="white" - activeBg="white" - value={way} - w={'100%'} - onChange={(e) => { - setValue('way', e); - }} - ></LeftRadio> - </Box> + <HStack gap={[3, 7]}> + <HStack flex={'1'} spacing={1}> + <Checkbox {...register('autoIndexes')}> + <FormLabel>{t('dataset:auto_indexes')}</FormLabel> + </Checkbox> + <QuestionTip label={t('dataset:auto_indexes_tips')} /> + </HStack> + <HStack flex={'1'} spacing={1}> + <MyTooltip + label={!datasetDetail?.vlmModel ? t('common:error_vlm_not_config') : ''} + > + <Checkbox isDisabled={!datasetDetail?.vlmModel} {...register('imageIndex')}> + <FormLabel>{t('dataset:image_auto_parse')}</FormLabel> + </Checkbox> + </MyTooltip> + <QuestionTip label={t('dataset:image_auto_parse_tips')} /> + </HStack> + </HStack> + </Box> + )} + <Box mt={6}> + <Box fontSize={'sm'} mb={2} color={'myGray.600'}> + {t('dataset:params_setting')} + </Box> + <LeftRadio<ChunkSettingModeEnum> + list={[ + { + title: t('dataset:default_params'), + desc: t('dataset:default_params_desc'), + value: ChunkSettingModeEnum.auto + }, + { + title: t('dataset:custom_data_process_params'), + desc: t('dataset:custom_data_process_params_desc'), + value: ChunkSettingModeEnum.custom, + children: chunkSettingMode === ChunkSettingModeEnum.custom && ( + <Box mt={5}> + <Box> + <Flex alignItems={'center'}> + <Box>{t('dataset:ideal_chunk_length')}</Box> + <QuestionTip label={t('dataset:ideal_chunk_length_tips')} /> + </Flex> + <Box + mt={1} + css={{ + '& > span': { + display: 'block' + } + }} + > + <MyTooltip + label={t('common:core.dataset.import.Chunk Range', { + min: minChunkSize, + max: maxChunkSize + })} + > + <MyNumberInput + register={register} + name={chunkSizeField} + min={minChunkSize} + max={maxChunkSize} + size={'sm'} + step={100} + /> + </MyTooltip> + </Box> + </Box> - {feConfigs?.show_pay && ( - <Box mt={5} pl={[0, '100px']} gap={3}> - <MyTag colorSchema={'gray'} py={1.5} borderRadius={'md'} px={3} whiteSpace={'wrap'}> - {priceTip} - 
</MyTag> - </Box> - )} + <Box mt={3}> + <Box> + {t('common:core.dataset.import.Custom split char')} + <QuestionTip + label={t('common:core.dataset.import.Custom split char Tips')} + /> + </Box> + <Box mt={1}> + <Input + size={'sm'} + bg={'myGray.50'} + defaultValue={''} + placeholder="\n;======;==SPLIT==" + {...register('customSplitChar')} + /> + </Box> + </Box> - <Flex mt={5} gap={3} justifyContent={'flex-end'}> - <Button - onClick={() => { - goToNext(); - }} - > - {t('common:common.Next Step')} - </Button> - </Flex> - </Box> - <Box flex={'1 0 0'} w={['auto', '0']} h={['auto', '100%']} pl={[0, 3]}> - <Preview showPreviewChunks={showPreviewChunks} /> + {showQAPromptInput && ( + <Box mt={3}> + <Box>{t('common:core.dataset.collection.QA Prompt')}</Box> + <Box + position={'relative'} + py={2} + px={3} + bg={'myGray.50'} + fontSize={'xs'} + whiteSpace={'pre-wrap'} + border={'1px'} + borderColor={'borderColor.base'} + borderRadius={'md'} + maxH={'140px'} + overflow={'auto'} + _hover={{ + '& .mask': { + display: 'block' + } + }} + > + {getValues('qaPrompt')} + + <Box + display={'none'} + className="mask" + position={'absolute'} + top={0} + right={0} + bottom={0} + left={0} + background={ + 'linear-gradient(182deg, rgba(255, 255, 255, 0.00) 1.76%, #FFF 84.07%)' + } + > + <Button + size="xs" + variant={'whiteBase'} + leftIcon={<MyIcon name={'edit'} w={'13px'} />} + color={'black'} + position={'absolute'} + right={2} + bottom={2} + onClick={onOpenCustomPrompt} + > + {t('common:core.dataset.import.Custom prompt')} + </Button> + </Box> + </Box> + </Box> + )} + </Box> + ) + } + ]} + gridGap={3} + px={3} + py={3} + defaultBg="white" + activeBg="white" + value={chunkSettingMode} + w={'100%'} + onChange={(e) => { + setValue('chunkSettingMode', e); + }} + /> + </Box> + </AccordionPanel> + </AccordionItem> + + {/* <AccordionItem mt={4} border={'none'}> + <Title title={t('dataset:import_model_config')} /> + <AccordionPanel p={2} fontSize={'sm'}> + <Box> + 
<Box>{t('common:core.ai.model.Dataset Agent Model')}</Box> + <Box mt={1}> + <AIModelSelector + w={'100%'} + value={llmModel} + list={datasetModelList.map((item) => ({ + label: item.name, + value: item.model + }))} + onchange={(e) => { + setValue('llmModel', e); + }} + /> + </Box> + </Box> + <Box pt={5}> + <Box>{t('dataset:vllm_model')}</Box> + <Box mt={1}> + <AIModelSelector + w={'100%'} + value={vlmModel} + list={vllmModelList.map((item) => ({ + label: item.name, + value: item.model + }))} + onchange={(e) => { + setValue('vlmModel', e); + }} + /> + </Box> + </Box> + </AccordionPanel> + </AccordionItem> */} + + <Flex mt={5} gap={3} justifyContent={'flex-end'}> + <Button + onClick={() => { + goToNext(); + }} + > + {t('common:common.Next Step')} + </Button> + </Flex> + </Accordion> </Box> {isOpenCustomPrompt && ( @@ -273,7 +379,7 @@ function DataProcess({ showPreviewChunks = true }: { showPreviewChunks: boolean onClose={onCloseCustomPrompt} /> )} - </Box> + </> ); } diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx index 892ffc25f..1b2ce5c23 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx @@ -1,19 +1,162 @@ -import React from 'react'; -import Preview from '../components/Preview'; -import { Box, Button, Flex } from '@chakra-ui/react'; +import React, { useState } from 'react'; +import { Box, Button, Flex, HStack } from '@chakra-ui/react'; import { useTranslation } from 'next-i18next'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext } from '../Context'; +import MyIcon from '@fastgpt/web/components/common/Icon'; +import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; +import EmptyTip from '@fastgpt/web/components/common/EmptyTip'; +import { 
useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; +import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { getPreviewChunks } from '@/web/core/dataset/api'; +import { ImportSourceItemType } from '@/web/core/dataset/type'; +import { getPreviewSourceReadType } from '../utils'; +import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import MyBox from '@fastgpt/web/components/common/MyBox'; +import Markdown from '@/components/Markdown'; +import { useToast } from '@fastgpt/web/hooks/useToast'; -const PreviewData = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { +const PreviewData = () => { const { t } = useTranslation(); + const { toast } = useToast(); const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext); + const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); + + const sources = useContextSelector(DatasetImportContext, (v) => v.sources); + const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource); + const chunkSize = useContextSelector(DatasetImportContext, (v) => v.chunkSize); + const chunkOverlapRatio = useContextSelector(DatasetImportContext, (v) => v.chunkOverlapRatio); + const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm); + + const [previewFile, setPreviewFile] = useState<ImportSourceItemType>(); + + const { data = [], loading: isLoading } = useRequest2( + async () => { + if (!previewFile) return; + if (importSource === ImportDataSourceEnum.fileCustom) { + const customSplitChar = processParamsForm.getValues('customSplitChar'); + const { chunks } = splitText2Chunks({ + text: previewFile.rawText || '', + chunkLen: chunkSize, + overlapRatio: chunkOverlapRatio, + customReg: customSplitChar ? 
[customSplitChar] : [] + }); + return chunks.map((chunk) => ({ + q: chunk, + a: '' + })); + } + + return getPreviewChunks({ + datasetId, + type: getPreviewSourceReadType(previewFile), + sourceId: + previewFile.dbFileId || + previewFile.link || + previewFile.externalFileUrl || + previewFile.apiFileId || + '', + + customPdfParse: processParamsForm.getValues('customPdfParse'), + + chunkSize, + overlapRatio: chunkOverlapRatio, + customSplitChar: processParamsForm.getValues('customSplitChar'), + + selector: processParamsForm.getValues('webSelector'), + isQAImport: importSource === ImportDataSourceEnum.csvTable, + externalFileId: previewFile.externalFileId + }); + }, + { + refreshDeps: [previewFile], + manual: false, + onSuccess(result) { + if (!previewFile) return; + if (!result || result.length === 0) { + toast({ + title: t('dataset:preview_chunk_empty'), + status: 'error' + }); + } + } + } + ); + return ( <Flex flexDirection={'column'} h={'100%'}> - <Box flex={'1 0 0 '}> - <Preview showPreviewChunks={showPreviewChunks} /> - </Box> + <Flex flex={'1 0 0'} border={'base'} borderRadius={'md'}> + <Flex flexDirection={'column'} flex={'1 0 0'} borderRight={'base'}> + <FormLabel fontSize={'md'} py={4} px={5} borderBottom={'base'}> + {t('dataset:file_list')} + </FormLabel> + <Box flex={'1 0 0'} overflowY={'auto'} px={5} py={3}> + {sources.map((source) => ( + <HStack + key={source.id} + bg={'myGray.50'} + p={4} + borderRadius={'md'} + borderWidth={'1px'} + borderColor={'transparent'} + cursor={'pointer'} + _hover={{ + borderColor: 'primary.300' + }} + {...(previewFile?.id === source.id && { + borderColor: 'primary.500 !important', + bg: 'primary.50 !important' + })} + _notLast={{ mb: 3 }} + onClick={() => setPreviewFile(source)} + > + <MyIcon name={source.icon as any} w={'1.25rem'} /> + <Box ml={1} flex={'1 0 0'} wordBreak={'break-all'} fontSize={'sm'}> + {source.sourceName} + </Box> + </HStack> + ))} + </Box> + </Flex> + <Flex flexDirection={'column'} flex={'1 0 0'}> + <Flex 
py={4} px={5} borderBottom={'base'} justifyContent={'space-between'}> + <FormLabel fontSize={'md'}>{t('dataset:preview_chunk')}</FormLabel> + <Box fontSize={'xs'} color={'myGray.500'}> + {t('dataset:preview_chunk_intro')} + </Box> + </Flex> + <MyBox isLoading={isLoading} flex={'1 0 0'} h={0}> + <Box h={'100%'} overflowY={'auto'} px={5} py={3}> + {previewFile ? ( + <> + {data.map((item, index) => ( + <Box + key={index} + fontSize={'sm'} + color={'myGray.600'} + _notLast={{ + mb: 3, + pb: 3, + borderBottom: 'base' + }} + _hover={{ + bg: 'myGray.100' + }} + > + <Markdown source={item.q} /> + <Markdown source={item.a} /> + </Box> + ))} + </> + ) : ( + <EmptyTip text={t('dataset:preview_chunk_not_selected')} /> + )} + </Box> + </MyBox> + </Flex> + </Flex> <Flex mt={2} justifyContent={'flex-end'}> <Button onClick={goToNext}>{t('common:common.Next Step')}</Button> </Flex> diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx index e811a5254..489f9c0f2 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx @@ -14,7 +14,10 @@ import { IconButton, Tooltip } from '@chakra-ui/react'; -import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; +import { + DatasetCollectionDataProcessModeEnum, + ImportDataSourceEnum +} from '@fastgpt/global/core/dataset/constants'; import { useTranslation } from 'next-i18next'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; @@ -34,6 +37,7 @@ import MyTag from '@fastgpt/web/components/common/Tag/index'; import { useContextSelector } from 'use-context-selector'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetImportContext, type ImportFormType } from '../Context'; 
+import { ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; const Upload = () => { const { t } = useTranslation(); @@ -77,7 +81,7 @@ const Upload = () => { }, [waitingFilesCount, totalFilesCount, allFinished, t]); const { runAsync: startUpload, loading: isLoading } = useRequest2( - async ({ mode, customSplitChar, qaPrompt, webSelector }: ImportFormType) => { + async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => { if (sources.length === 0) return; const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting'); @@ -95,15 +99,21 @@ const Upload = () => { ); // create collection - const commonParams = { + const commonParams: ApiCreateDatasetCollectionParams & { + name: string; + } = { parentId, - trainingType: mode, datasetId: datasetDetail._id, + name: item.sourceName, + + customPdfParse: processParamsForm.getValues('customPdfParse'), + + trainingType, + imageIndex: processParamsForm.getValues('imageIndex'), + autoIndexes: processParamsForm.getValues('autoIndexes'), chunkSize, chunkSplitter: customSplitChar, - qaPrompt, - - name: item.sourceName + qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? 
qaPrompt : undefined }; if (importSource === ImportDataSourceEnum.reTraining) { const res = await postReTrainingDatasetFileCollection({ @@ -272,7 +282,7 @@ const Upload = () => { <Flex justifyContent={'flex-end'} mt={4}> <Button isLoading={isLoading} onClick={handleSubmit((data) => startUpload(data))}> {totalFilesCount > 0 && - `${t('common:core.dataset.import.Total files', { + `${t('dataset:total_num_files', { total: totalFilesCount })} | `} {buttonText} diff --git a/projects/app/src/pageComponents/dataset/detail/Import/components/Preview.tsx b/projects/app/src/pageComponents/dataset/detail/Import/components/Preview.tsx deleted file mode 100644 index 09ba85742..000000000 --- a/projects/app/src/pageComponents/dataset/detail/Import/components/Preview.tsx +++ /dev/null @@ -1,102 +0,0 @@ -import React, { useState } from 'react'; -import { Box, Flex, Grid, IconButton } from '@chakra-ui/react'; -import MyIcon from '@fastgpt/web/components/common/Icon'; -import { useTranslation } from 'next-i18next'; - -import MyMenu from '@fastgpt/web/components/common/MyMenu'; -import { ImportSourceItemType } from '@/web/core/dataset/type'; -import dynamic from 'next/dynamic'; -import { useContextSelector } from 'use-context-selector'; -import { DatasetImportContext } from '../Context'; -const PreviewRawText = dynamic(() => import('./PreviewRawText')); -const PreviewChunks = dynamic(() => import('./PreviewChunks')); - -const Preview = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { - const { t } = useTranslation(); - - const { sources } = useContextSelector(DatasetImportContext, (v) => v); - const [previewRawTextSource, setPreviewRawTextSource] = useState<ImportSourceItemType>(); - const [previewChunkSource, setPreviewChunkSource] = useState<ImportSourceItemType>(); - - return ( - <Box h={'100%'} w={'100%'} display={['block', 'flex']} flexDirection={'column'}> - <Flex alignItems={'center'}> - <MyIcon name={'core/dataset/fileCollection'} w={'20px'} /> - <Box 
fontSize={'md'}>{t('common:core.dataset.import.Sources list')}</Box> - </Flex> - <Box mt={3} flex={'1 0 0'} h={['auto', 0]} width={'100%'} overflowY={'auto'}> - <Grid w={'100%'} gap={3} gridTemplateColumns={['1fr', '1fr', '1fr', '1fr', '1fr 1fr']}> - {sources.map((source) => ( - <Flex - key={source.id} - bg={'white'} - p={4} - borderRadius={'md'} - borderWidth={'1px'} - borderColor={'borderColor.low'} - boxShadow={'2'} - alignItems={'center'} - > - <MyIcon name={source.icon as any} w={['1rem', '1.25rem']} /> - <Box mx={1} flex={'1 0 0'} wordBreak={'break-all'} fontSize={'sm'}> - {source.sourceName} - </Box> - {showPreviewChunks && ( - <Box fontSize={'xs'} color={'myGray.600'}> - <MyMenu - Button={ - <IconButton - icon={<MyIcon name={'common/viewLight'} w={'14px'} p={2} />} - aria-label={''} - size={'sm'} - variant={'whitePrimary'} - /> - } - menuList={[ - { - children: [ - { - label: ( - <Flex alignItems={'center'}> - <MyIcon name={'core/dataset/fileCollection'} w={'14px'} mr={2} /> - {t('common:core.dataset.import.Preview raw text')} - </Flex> - ), - onClick: () => setPreviewRawTextSource(source) - }, - { - label: ( - <Flex alignItems={'center'}> - <MyIcon name={'core/dataset/splitLight'} w={'14px'} mr={2} /> - {t('common:core.dataset.import.Preview chunks')} - </Flex> - ), - onClick: () => setPreviewChunkSource(source) - } - ] - } - ]} - /> - </Box> - )} - </Flex> - ))} - </Grid> - </Box> - {!!previewRawTextSource && ( - <PreviewRawText - previewSource={previewRawTextSource} - onClose={() => setPreviewRawTextSource(undefined)} - /> - )} - {!!previewChunkSource && ( - <PreviewChunks - previewSource={previewChunkSource} - onClose={() => setPreviewChunkSource(undefined)} - /> - )} - </Box> - ); -}; - -export default React.memo(Preview); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewRawText.tsx b/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewRawText.tsx deleted file mode 100644 index 
6eb01b693..000000000 --- a/projects/app/src/pageComponents/dataset/detail/Import/components/PreviewRawText.tsx +++ /dev/null @@ -1,78 +0,0 @@ -import React from 'react'; -import { Box } from '@chakra-ui/react'; -import { ImportSourceItemType } from '@/web/core/dataset/type'; -import { getPreviewFileContent } from '@/web/common/file/api'; -import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer'; -import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; -import { useToast } from '@fastgpt/web/hooks/useToast'; -import { getErrText } from '@fastgpt/global/common/error/utils'; -import { useContextSelector } from 'use-context-selector'; -import { DatasetImportContext } from '../Context'; -import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; -import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; -import { getPreviewSourceReadType } from '../utils'; - -const PreviewRawText = ({ - previewSource, - onClose -}: { - previewSource: ImportSourceItemType; - onClose: () => void; -}) => { - const { toast } = useToast(); - const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v); - const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); - - const { data, loading: isLoading } = useRequest2( - async () => { - if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) { - return { - previewContent: previewSource.rawText.slice(0, 3000) - }; - } - - return getPreviewFileContent({ - datasetId, - type: getPreviewSourceReadType(previewSource), - sourceId: - previewSource.dbFileId || - previewSource.link || - previewSource.externalFileUrl || - previewSource.apiFileId || - '', - - isQAImport: importSource === ImportDataSourceEnum.csvTable, - selector: processParamsForm.getValues('webSelector'), - externalFileId: previewSource.externalFileId - }); - }, - { - refreshDeps: [previewSource.dbFileId, previewSource.link, 
previewSource.externalFileUrl], - manual: false, - onError(err) { - toast({ - status: 'warning', - title: getErrText(err) - }); - } - } - ); - - const rawText = data?.previewContent || ''; - - return ( - <MyRightDrawer - onClose={onClose} - iconSrc={previewSource.icon} - title={previewSource.sourceName} - isLoading={isLoading} - px={0} - > - <Box whiteSpace={'pre-wrap'} overflowY={'auto'} px={5} fontSize={'sm'}> - {rawText} - </Box> - </MyRightDrawer> - ); -}; - -export default React.memo(PreviewRawText); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/components/RenderFiles.tsx b/projects/app/src/pageComponents/dataset/detail/Import/components/RenderFiles.tsx index 473575560..86aae27cb 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/components/RenderFiles.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/components/RenderFiles.tsx @@ -14,24 +14,17 @@ import { import { ImportSourceItemType } from '@/web/core/dataset/type.d'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { useTranslation } from 'next-i18next'; -import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; -import dynamic from 'next/dynamic'; import { useI18n } from '@/web/context/I18n'; -const PreviewRawText = dynamic(() => import('./PreviewRawText')); - export const RenderUploadFiles = ({ files, - setFiles, - showPreviewContent + setFiles }: { files: ImportSourceItemType[]; setFiles: React.Dispatch<React.SetStateAction<ImportSourceItemType[]>>; - showPreviewContent?: boolean; }) => { const { t } = useTranslation(); const { fileT } = useI18n(); - const [previewFile, setPreviewFile] = useState<ImportSourceItemType>(); return files.length > 0 ? 
( <> @@ -84,18 +77,6 @@ export const RenderUploadFiles = ({ <Td> {!item.isUploading && ( <Flex alignItems={'center'} gap={4}> - {showPreviewContent && ( - <MyTooltip label={t('common:core.dataset.import.Preview raw text')}> - <IconButton - variant={'whitePrimary'} - size={'sm'} - icon={<MyIcon name={'common/viewLight'} w={'18px'} />} - aria-label={''} - onClick={() => setPreviewFile(item)} - /> - </MyTooltip> - )} - <IconButton variant={'grayDanger'} size={'sm'} @@ -113,9 +94,6 @@ export const RenderUploadFiles = ({ </Tbody> </Table> </TableContainer> - {!!previewFile && ( - <PreviewRawText previewSource={previewFile} onClose={() => setPreviewFile(undefined)} /> - )} </> ) : null; }; diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/APIDataset.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/APIDataset.tsx index e0cdff95c..dee1dd83f 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/APIDataset.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/APIDataset.tsx @@ -28,7 +28,7 @@ const APIDatasetCollection = () => { return ( <> {activeStep === 0 && <CustomAPIFileInput />} - {activeStep === 1 && <DataProcess showPreviewChunks={true} />} + {activeStep === 1 && <DataProcess />} {activeStep === 2 && <Upload />} </> ); @@ -272,7 +272,7 @@ const CustomAPIFileInput = () => { onClick={onclickNext} > {selectFiles.length > 0 - ? `${t('common:core.dataset.import.Total files', { total: selectFiles.length })} | ` + ? 
`${t('dataset:total_num_files', { total: selectFiles.length })} | ` : ''} {t('common:common.Next Step')} </Button> diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ExternalFile.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ExternalFile.tsx index aef5bb934..e1e29b00a 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ExternalFile.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ExternalFile.tsx @@ -34,7 +34,7 @@ const ExternalFileCollection = () => { return ( <> {activeStep === 0 && <CustomLinkInput />} - {activeStep === 1 && <DataProcess showPreviewChunks={true} />} + {activeStep === 1 && <DataProcess />} {activeStep === 2 && <Upload />} </> ); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx index b40fd7489..12c0c28de 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx @@ -19,7 +19,7 @@ const CustomTet = () => { return ( <> {activeStep === 0 && <CustomTextInput />} - {activeStep === 1 && <DataProcess showPreviewChunks />} + {activeStep === 1 && <DataProcess />} {activeStep === 2 && <Upload />} </> ); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLink.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLink.tsx index 11a5e4fe7..b9f0e192e 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLink.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLink.tsx @@ -23,7 +23,7 @@ const LinkCollection = () => { return ( <> {activeStep === 0 && <CustomLinkImport />} - {activeStep === 1 && <DataProcess showPreviewChunks />} + {activeStep === 1 && <DataProcess />} {activeStep === 2 && <Upload 
/>} </> ); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx index aa162f4a9..605511c80 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx @@ -10,9 +10,8 @@ import { RenderUploadFiles } from '../components/RenderFiles'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext } from '../Context'; -const DataProcess = dynamic(() => import('../commonProgress/DataProcess'), { - loading: () => <Loading fixed={false} /> -}); +const DataProcess = dynamic(() => import('../commonProgress/DataProcess')); +const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); const Upload = dynamic(() => import('../commonProgress/Upload')); const fileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx'; @@ -23,8 +22,9 @@ const FileLocal = () => { return ( <> {activeStep === 0 && <SelectFile />} - {activeStep === 1 && <DataProcess showPreviewChunks />} - {activeStep === 2 && <Upload />} + {activeStep === 1 && <DataProcess />} + {activeStep === 2 && <PreviewData />} + {activeStep === 3 && <Upload />} </> ); }; @@ -64,12 +64,12 @@ const SelectFile = React.memo(function SelectFile() { /> {/* render files */} - <RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} showPreviewContent /> + <RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} /> <Box textAlign={'right'} mt={5}> <Button isDisabled={successFiles.length === 0 || uploading} onClick={onclickNext}> {selectFiles.length > 0 - ? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | ` + ? 
`${t('dataset:total_num_files', { total: selectFiles.length })} | ` : ''} {t('common:common.Next Step')} </Button> diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx index 2c6e6cc4e..ba7d56fa2 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx @@ -8,10 +8,13 @@ import { useRouter } from 'next/router'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { getDatasetCollectionById } from '@/web/core/dataset/api'; import MyBox from '@fastgpt/web/components/common/MyBox'; -import { ImportProcessWayEnum } from '@/web/core/dataset/constants'; +import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; +import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import { Box } from '@chakra-ui/react'; const Upload = dynamic(() => import('../commonProgress/Upload')); +const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); const ReTraining = () => { const router = useRouter(); @@ -20,6 +23,7 @@ const ReTraining = () => { collectionId: string; }; + const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep); const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources); const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm); @@ -43,8 +47,12 @@ const ReTraining = () => { } ]); processParamsForm.reset({ - mode: collection.trainingType, - way: ImportProcessWayEnum.auto, + customPdfParse: collection.customPdfParse, + trainingType: collection.trainingType, + imageIndex: collection.imageIndex, + autoIndexes: 
collection.autoIndexes, + + chunkSettingMode: ChunkSettingModeEnum.auto, embeddingChunkSize: collection.chunkSize, qaChunkSize: collection.chunkSize, customSplitChar: collection.chunkSplitter, @@ -55,9 +63,12 @@ const ReTraining = () => { }); return ( - <MyBox isLoading={loading} h={'100%'} overflow={'auto'}> - {activeStep === 0 && <DataProcess showPreviewChunks={true} />} - {activeStep === 1 && <Upload />} + <MyBox isLoading={loading} h={'100%'}> + <Box h={'100%'} overflow={'auto'}> + {activeStep === 0 && <DataProcess />} + {activeStep === 1 && <PreviewData />} + {activeStep === 2 && <Upload />} + </Box> </MyBox> ); }; diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/TableLocal.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/TableLocal.tsx index 83cc4b7d0..2b878ff6e 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/TableLocal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/TableLocal.tsx @@ -21,7 +21,7 @@ const FileLocal = () => { return ( <> {activeStep === 0 && <SelectFile />} - {activeStep === 1 && <PreviewData showPreviewChunks />} + {activeStep === 1 && <PreviewData />} {activeStep === 2 && <Upload />} </> ); @@ -91,7 +91,7 @@ const SelectFile = React.memo(function SelectFile() { }} > {selectFiles.length > 0 - ? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | ` + ? 
`${t('dataset:total_num_files', { total: selectFiles.length })} | ` : ''} {t('common:common.Next Step')} </Button> diff --git a/projects/app/src/pageComponents/dataset/detail/Info/index.tsx b/projects/app/src/pageComponents/dataset/detail/Info/index.tsx index 89dbe73a7..375e07abd 100644 --- a/projects/app/src/pageComponents/dataset/detail/Info/index.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Info/index.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useState } from 'react'; +import React, { useEffect, useMemo, useState } from 'react'; import { Box, Flex, Switch, Input } from '@chakra-ui/react'; import { useConfirm } from '@fastgpt/web/hooks/useConfirm'; import { useForm } from 'react-hook-form'; @@ -37,6 +37,8 @@ const Info = ({ datasetId }: { datasetId: string }) => { const { t } = useTranslation(); const { datasetDetail, loadDatasetDetail, updateDataset, rebuildingCount, trainingCount } = useContextSelector(DatasetPageContext, (v) => v); + const { feConfigs, datasetModelList, embeddingModelList, getVlmModelList } = useSystemStore(); + const [editedDataset, setEditedDataset] = useState<EditResourceInfoFormType>(); const [editedAPIDataset, setEditedAPIDataset] = useState<EditAPIDatasetInfoFormType>(); const refetchDatasetTraining = useContextSelector( @@ -50,7 +52,9 @@ const Info = ({ datasetId }: { datasetId: string }) => { const vectorModel = watch('vectorModel'); const agentModel = watch('agentModel'); - const { feConfigs, datasetModelList, embeddingModelList } = useSystemStore(); + const vllmModelList = useMemo(() => getVlmModelList(), [getVlmModelList]); + const vlmModel = watch('vlmModel'); + const { ConfirmModal: ConfirmDelModal } = useConfirm({ content: t('common:core.dataset.Delete Confirm'), type: 'delete' @@ -69,7 +73,8 @@ const Info = ({ datasetId }: { datasetId: string }) => { (data: DatasetItemType) => { return updateDataset({ id: datasetId, - agentModel: data.agentModel, + agentModel: data.agentModel?.model, + vlmModel: 
data.vlmModel?.model, externalReadUrl: data.externalReadUrl }); }, @@ -225,6 +230,31 @@ const Info = ({ datasetId }: { datasetId: string }) => { </Box> </Box> + {feConfigs?.isPlus && ( + <Box pt={5}> + <FormLabel fontSize={'mini'} fontWeight={'500'}> + {t('dataset:vllm_model')} + </FormLabel> + <Box pt={2}> + <AIModelSelector + w={'100%'} + value={vlmModel?.model} + list={vllmModelList.map((item) => ({ + label: item.name, + value: item.model + }))} + fontSize={'mini'} + onchange={(e) => { + const vlmModel = vllmModelList.find((item) => item.model === e); + if (!vlmModel) return; + setValue('vlmModel', vlmModel); + return handleSubmit((data) => onSave({ ...data, vlmModel }))(); + }} + /> + </Box> + </Box> + )} + {feConfigs?.isPlus && ( <Flex alignItems={'center'} pt={5}> <FormLabel fontSize={'mini'} fontWeight={'500'}> diff --git a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx index e39373e6f..ce69dea01 100644 --- a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx @@ -1,9 +1,7 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; -import { Box, Flex, Button, Textarea, useTheme, Grid, HStack } from '@chakra-ui/react'; +import { Box, Flex, Button, Textarea } from '@chakra-ui/react'; import { - Control, FieldArrayWithId, - UseFieldArrayAppend, UseFieldArrayRemove, UseFormRegister, useFieldArray, @@ -12,7 +10,6 @@ import { import { postInsertData2Dataset, putDatasetDataById, - delOneDatasetDataById, getDatasetCollectionById, getDatasetDataItemById } from '@/web/core/dataset/api'; @@ -22,9 +19,8 @@ import MyModal from '@fastgpt/web/components/common/MyModal'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useQuery } from '@tanstack/react-query'; import { useTranslation } from 'next-i18next'; -import { useRequest, useRequest2 } from 
'@fastgpt/web/hooks/useRequest'; -import { useConfirm } from '@fastgpt/web/hooks/useConfirm'; -import { getDefaultIndex, getSourceNameIcon } from '@fastgpt/global/core/dataset/utils'; +import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils'; import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type'; import DeleteIcon from '@fastgpt/web/components/common/Icon/delete'; import { defaultCollectionDetail } from '@/web/core/dataset/constants'; @@ -33,9 +29,12 @@ import MyBox from '@fastgpt/web/components/common/MyBox'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip'; -import { useSystem } from '@fastgpt/web/hooks/useSystem'; import LightRowTabs from '@fastgpt/web/components/common/Tabs/LightRowTabs'; import styles from './styles.module.scss'; +import { + DatasetDataIndexTypeEnum, + getDatasetIndexMapData +} from '@fastgpt/global/core/dataset/data/constants'; export type InputDataType = { q: string; @@ -64,11 +63,10 @@ const InputDataModal = ({ onSuccess: (data: InputDataType & { dataId: string }) => void; }) => { const { t } = useTranslation(); - const theme = useTheme(); const { toast } = useToast(); const [currentTab, setCurrentTab] = useState(TabEnum.content); const { embeddingModelList, defaultModels } = useSystemStore(); - const { isPc } = useSystem(); + const { register, handleSubmit, reset, control } = useForm<InputDataType>(); const { fields: indexes, @@ -114,11 +112,6 @@ const InputDataModal = ({ } ]; - const { ConfirmModal, openConfirm } = useConfirm({ - content: t('common:dataset.data.Delete Tip'), - type: 'delete' - }); - const { data: collection = defaultCollectionDetail } = useQuery( ['loadCollectionId', collectionId], () => { @@ -165,8 +158,8 @@ const InputDataModal = ({ }, 
[collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]); // import new data - const { mutate: sureImportData, isLoading: isImporting } = useRequest({ - mutationFn: async (e: InputDataType) => { + const { runAsync: sureImportData, loading: isImporting } = useRequest2( + async (e: InputDataType) => { if (!e.q) { setCurrentTab(TabEnum.content); return Promise.reject(t('common:dataset.data.input is empty')); @@ -183,12 +176,8 @@ const InputDataModal = ({ collectionId: collection._id, q: e.q, a: e.a, - // remove dataId - indexes: - e.indexes?.map((index) => ({ - ...index, - dataId: undefined - })) || [] + // Contains no default index + indexes: e.indexes }); return { @@ -196,18 +185,20 @@ const InputDataModal = ({ dataId }; }, - successToast: t('common:dataset.data.Input Success Tip'), - onSuccess(e) { - reset({ - ...e, - q: '', - a: '', - indexes: [] - }); - onSuccess(e); - }, - errorToast: t('common:common.error.unKnow') - }); + { + successToast: t('common:dataset.data.Input Success Tip'), + onSuccess(e) { + reset({ + ...e, + q: '', + a: '', + indexes: [] + }); + onSuccess(e); + }, + errorToast: t('common:common.error.unKnow') + } + ); // update const { runAsync: onUpdateData, loading: isUpdating } = useRequest2( @@ -218,10 +209,7 @@ const InputDataModal = ({ await putDatasetDataById({ dataId, ...e, - indexes: - e.indexes?.map((index) => - index.defaultIndex ? 
getDefaultIndex({ q: e.q, a: e.a, dataId: index.dataId }) : index - ) || [] + indexes: e.indexes }); return { @@ -244,6 +232,7 @@ const InputDataModal = ({ () => getSourceNameIcon({ sourceName: collection.sourceName, sourceId: collection.sourceId }), [collection] ); + return ( <MyModal isOpen={true} @@ -296,9 +285,8 @@ const InputDataModal = ({ p={0} onClick={() => appendIndexes({ - defaultIndex: false, - text: '', - dataId: `${Date.now()}` + type: DatasetDataIndexTypeEnum.custom, + text: '' }) } > @@ -315,7 +303,6 @@ const InputDataModal = ({ <DataIndex register={register} maxToken={maxToken} - appendIndexes={appendIndexes} removeIndexes={removeIndexes} indexes={indexes} /> @@ -337,7 +324,6 @@ const InputDataModal = ({ </MyTooltip> </Flex> </MyBox> - <ConfirmModal /> </MyModal> ); }; @@ -424,13 +410,11 @@ const DataIndex = ({ maxToken, register, indexes, - appendIndexes, removeIndexes }: { maxToken: number; register: UseFormRegister<InputDataType>; indexes: FieldArrayWithId<InputDataType, 'indexes', 'id'>[]; - appendIndexes: UseFieldArrayAppend<InputDataType, 'indexes'>; removeIndexes: UseFieldArrayRemove; }) => { const { t } = useTranslation(); @@ -438,52 +422,41 @@ const DataIndex = ({ return ( <> <Flex mt={3} gap={3} flexDir={'column'}> - <Box - p={4} - borderRadius={'md'} - border={'1.5px solid var(--light-fastgpt-primary-opacity-01, rgba(51, 112, 255, 0.10))'} - bg={'primary.50'} - > - <Flex mb={2}> - <Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'primary.700'}> - {t('common:dataset.data.Default Index')} - </Box> - </Flex> - <Box fontSize={'sm'} fontWeight={'medium'} color={'myGray.600'}> - {t('common:core.dataset.data.Default Index Tip')} - </Box> - </Box> {indexes?.map((index, i) => { + const data = getDatasetIndexMapData(index.type); return ( - !index.defaultIndex && ( - <Box - key={index.dataId || i} - p={4} - borderRadius={'md'} - border={'1.5px solid var(--Gray-Modern-200, #E8EBF0)'} - bg={'myGray.25'} - _hover={{ - '& .delete': { - 
display: 'block' - } - }} - > - <Flex mb={2}> - <Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'myGray.900'}> - {t('dataset.data.Custom Index Number', { number: i })} - </Box> + <Box + key={index.dataId || i} + p={4} + borderRadius={'md'} + border={'1.5px solid var(--Gray-Modern-200, #E8EBF0)'} + bg={'myGray.25'} + _hover={{ + '& .delete': { + display: 'block' + } + }} + > + <Flex mb={2}> + <Box flex={1} fontWeight={'medium'} fontSize={'sm'} color={'myGray.900'}> + {t(data.label)} + </Box> + {index.type !== 'default' && ( <DeleteIcon onClick={() => { - if (indexes.length <= 1) { - appendIndexes(getDefaultIndex({ dataId: `${Date.now()}` })); - } removeIndexes(i); }} /> - </Flex> - <DataIndexTextArea index={i} maxToken={maxToken} register={register} /> - </Box> - ) + )} + </Flex> + <DataIndexTextArea + disabled={index.type === 'default'} + index={i} + value={index.text} + maxToken={maxToken} + register={register} + /> + </Box> ); })} </Flex> @@ -491,14 +464,19 @@ const DataIndex = ({ ); }; +const textareaMinH = '40px'; const DataIndexTextArea = ({ + value, index, maxToken, - register + register, + disabled }: { + value: string; index: number; maxToken: number; register: UseFormRegister<InputDataType>; + disabled?: boolean; }) => { const { t } = useTranslation(); const TextareaDom = useRef<HTMLTextAreaElement | null>(null); @@ -509,7 +487,7 @@ const DataIndexTextArea = ({ onChange: onTextChange, onBlur } = register(`indexes.${index}.text`, { required: true }); - const textareaMinH = '40px'; + useEffect(() => { if (TextareaDom.current) { TextareaDom.current.style.height = textareaMinH; @@ -522,7 +500,12 @@ const DataIndexTextArea = ({ e.target.style.height = `${e.target.scrollHeight + 5}px`; } }, []); - return ( + + return disabled ? 
( + <Box fontSize={'sm'} color={'myGray.500'} whiteSpace={'pre-wrap'}> + {value} + </Box> + ) : ( <Textarea maxLength={maxToken} borderColor={'transparent'} diff --git a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx index cc339936c..43059c4c7 100644 --- a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx +++ b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx @@ -7,7 +7,10 @@ import { useRouter } from 'next/router'; import MyBox from '@fastgpt/web/components/common/MyBox'; import { formatFileSize } from '@fastgpt/global/common/file/tools'; import { formatTime2YMDHM } from '@fastgpt/global/common/string/time'; -import { DatasetCollectionTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants'; +import { + DatasetCollectionDataProcessModeMap, + DatasetCollectionTypeMap +} from '@fastgpt/global/core/dataset/constants'; import { getCollectionSourceAndOpen } from '@/web/core/dataset/hooks/readCollectionSource'; import MyIcon from '@fastgpt/web/components/common/Icon'; @@ -61,13 +64,25 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => { label: t('common:core.dataset.collection.metadata.Updatetime'), value: formatTime2YMDHM(collection.updateTime) }, + { + label: t('dataset:collection_metadata_custom_pdf_parse'), + value: collection.customPdfParse ? 'Yes' : 'No' + }, { label: t('common:core.dataset.collection.metadata.Raw text length'), value: collection.rawTextLength ?? '-' }, { - label: t('dataset:collection.Training type'), - value: t(TrainingTypeMap[collection.trainingType]?.label as any) + label: t('dataset:collection_metadata_image_parse'), + value: collection.imageIndex ? 'Yes' : 'No' + }, + { + label: t('dataset:auto_indexes'), + value: collection.autoIndexes ? 
'Yes' : 'No' + }, + { + label: t('dataset:collection.training_type'), + value: t(DatasetCollectionDataProcessModeMap[collection.trainingType]?.label as any) }, { label: t('common:core.dataset.collection.metadata.Chunk Size'), @@ -99,8 +114,8 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => { <Box fontSize={'md'} pb={4}> {t('common:core.dataset.collection.metadata.metadata')} </Box> - <Flex mb={4} wordBreak={'break-all'} fontSize={'sm'}> - <Box color={'myGray.500'} flex={'0 0 70px'}> + <Flex mb={3} wordBreak={'break-all'} fontSize={'sm'}> + <Box color={'myGray.500'} flex={'0 0 90px'}> {t('common:core.dataset.collection.id')}: </Box> <Box>{collection?._id}</Box> @@ -109,8 +124,8 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => { (item, i) => item.label && item.value && ( - <Flex key={i} alignItems={'center'} mb={4} wordBreak={'break-all'} fontSize={'sm'}> - <Box color={'myGray.500'} flex={'0 0 70px'}> + <Flex key={i} alignItems={'center'} mb={3} wordBreak={'break-all'} fontSize={'sm'}> + <Box color={'myGray.500'} flex={'0 0 90px'}> {item.label} </Box> <Box>{item.value}</Box> diff --git a/projects/app/src/pageComponents/dataset/list/CreateModal.tsx b/projects/app/src/pageComponents/dataset/list/CreateModal.tsx index cd3742f14..5e9005191 100644 --- a/projects/app/src/pageComponents/dataset/list/CreateModal.tsx +++ b/projects/app/src/pageComponents/dataset/list/CreateModal.tsx @@ -2,7 +2,6 @@ import React, { useMemo } from 'react'; import { Box, Flex, Button, ModalFooter, ModalBody, Input, HStack } from '@chakra-ui/react'; import { useSelectFile } from '@/web/common/file/hooks/useSelectFile'; import { useForm } from 'react-hook-form'; -import { useToast } from '@fastgpt/web/hooks/useToast'; import { useRouter } from 'next/router'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; @@ -41,7 +40,8 @@ const CreateModal = ({ }) => { const { t } = 
useTranslation(); const router = useRouter(); - const { defaultModels, embeddingModelList, datasetModelList } = useSystemStore(); + const { feConfigs, defaultModels, embeddingModelList, datasetModelList, getVlmModelList } = + useSystemStore(); const { isPc } = useSystem(); const datasetTypeMap = useMemo(() => { @@ -71,6 +71,8 @@ const CreateModal = ({ const filterNotHiddenVectorModelList = embeddingModelList.filter((item) => !item.hidden); + const vllmModelList = useMemo(() => getVlmModelList(), [getVlmModelList]); + const form = useForm<CreateDatasetParams>({ defaultValues: { parentId, @@ -81,13 +83,15 @@ const CreateModal = ({ vectorModel: defaultModels.embedding?.model || getWebDefaultEmbeddingModel(embeddingModelList)?.model, agentModel: - defaultModels.datasetTextLLM?.model || getWebDefaultLLMModel(datasetModelList)?.model + defaultModels.datasetTextLLM?.model || getWebDefaultLLMModel(datasetModelList)?.model, + vlmModel: defaultModels.datasetImageLLM?.model } }); const { register, setValue, handleSubmit, watch } = form; const avatar = watch('avatar'); const vectorModel = watch('vectorModel'); const agentModel = watch('agentModel'); + const vlmModel = watch('vlmModel'); const { File, @@ -174,6 +178,7 @@ const CreateModal = ({ /> </Flex> </Box> + <Flex mt={6} alignItems={['flex-start', 'center']} @@ -206,6 +211,7 @@ const CreateModal = ({ /> </Box> </Flex> + <Flex mt={6} alignItems={['flex-start', 'center']} @@ -232,11 +238,45 @@ const CreateModal = ({ value: item.model }))} onchange={(e) => { - setValue('agentModel' as const, e); + setValue('agentModel', e); }} /> </Box> </Flex> + + {feConfigs?.isPlus && ( + <Flex + mt={6} + alignItems={['flex-start', 'center']} + justify={'space-between'} + flexDir={['column', 'row']} + > + <HStack + spacing={1} + flex={['', '0 0 110px']} + fontSize={'sm'} + color={'myGray.900'} + fontWeight={500} + pb={['12px', '0']} + > + <Box>{t('dataset:vllm_model')}</Box> + </HStack> + <Box w={['100%', '300px']}> + <AIModelSelector + 
w={['100%', '300px']} + value={vlmModel} + list={vllmModelList.map((item) => ({ + label: item.name, + value: item.model + }))} + onchange={(e) => { + setValue('vlmModel', e); + }} + /> + </Box> + </Flex> + )} + {/* @ts-ignore */} <ApiDatasetForm type={type} form={form} /> </ModalBody> diff --git a/projects/app/src/pages/api/admin/initv490.ts b/projects/app/src/pages/api/admin/initv490.ts new file mode 100644 index 000000000..f18e9bc8e --- /dev/null +++ b/projects/app/src/pages/api/admin/initv490.ts @@ -0,0 +1,76 @@ +import { NextAPI } from '@/service/middleware/entry'; +import { authCert } from '@fastgpt/service/support/permission/auth/common'; +import { NextApiRequest, NextApiResponse } from 'next'; +import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; +import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; +import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; +import { PgClient } from '@fastgpt/service/common/vectorStore/pg'; +import { PG_ADDRESS } from '@fastgpt/service/common/vectorStore/constants'; + +// 所有 trainingType=auto 的 collection,都改成 trainingType=chunk +const updateCollections = async () => { + await MongoDatasetCollection.updateMany( + { + trainingType: DatasetCollectionDataProcessModeEnum.auto + }, + { + $set: { + trainingType: DatasetCollectionDataProcessModeEnum.chunk, + autoIndexes: true + } + } + ); +}; +const updateData = async () => { + await MongoDatasetData.updateMany({}, [ + { + $set: { + indexes: { + $map: { + input: '$indexes', + as: 'index', + in: { + $mergeObjects: [ + '$$index', + { + type: { + $cond: { + if: { $eq: ['$$index.defaultIndex', true] }, + then: DatasetDataIndexTypeEnum.default, + else: DatasetDataIndexTypeEnum.custom + } + } + } + ] + } + } + } + } + } + ]); +}; +const upgradePgVector = async () => { + if (!PG_ADDRESS) return; + await 
PgClient.query(` + ALTER EXTENSION vector UPDATE; + `); +}; + +async function handler(req: NextApiRequest, _res: NextApiResponse) { + await authCert({ req, authRoot: true }); + + console.log('升级 PG vector 插件'); + await upgradePgVector(); + + console.log('变更所有 collection 的 trainingType 为 chunk'); + await updateCollections(); + + console.log( + "更新所有 data 的 index, autoIndex=true 的,增加type='default',其他的增加 type='custom'" + ); + await updateData(); + return { success: true }; +} + +export default NextAPI(handler); diff --git a/projects/app/src/pages/api/common/file/previewContent.ts b/projects/app/src/pages/api/common/file/previewContent.ts deleted file mode 100644 index cfe4d1681..000000000 --- a/projects/app/src/pages/api/common/file/previewContent.ts +++ /dev/null @@ -1,78 +0,0 @@ -/* - Read db file content and response 3000 words -*/ -import type { NextApiResponse } from 'next'; -import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file'; -import { NextAPI } from '@/service/middleware/entry'; -import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; -import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; -import { ApiRequestProps } from '@fastgpt/service/type/next'; -import { - OwnerPermissionVal, - WritePermissionVal -} from '@fastgpt/global/support/permission/constant'; -import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; - -export type PreviewContextProps = { - datasetId: string; - type: DatasetSourceReadTypeEnum; - sourceId: string; - isQAImport?: boolean; - selector?: string; - externalFileId?: string; -}; - -async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) { - const { type, sourceId, isQAImport, selector, datasetId, externalFileId } = req.body; - - if (!sourceId) { - throw new Error('fileId is empty'); - } - - const { teamId, apiServer, feishuServer, yuqueServer } = await (async () => { - if (type === 
DatasetSourceReadTypeEnum.fileLocal) { - const res = await authCollectionFile({ - req, - authToken: true, - authApiKey: true, - fileId: sourceId, - per: OwnerPermissionVal - }); - return { - teamId: res.teamId - }; - } - const { dataset } = await authDataset({ - req, - authApiKey: true, - authToken: true, - datasetId, - per: WritePermissionVal - }); - return { - teamId: dataset.teamId, - apiServer: dataset.apiServer, - feishuServer: dataset.feishuServer, - yuqueServer: dataset.yuqueServer - }; - })(); - - const rawText = await readDatasetSourceRawText({ - teamId, - type, - sourceId, - isQAImport, - selector, - apiServer, - feishuServer, - yuqueServer, - externalFileId - }); - - return { - previewContent: rawText.slice(0, 3000), - totalLength: rawText.length - }; -} - -export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/apiCollection.ts b/projects/app/src/pages/api/core/dataset/collection/create/apiCollection.ts index 1950e42f4..c4c4d9578 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/apiCollection.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/apiCollection.ts @@ -4,7 +4,8 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; import { TrainingModeEnum, - DatasetCollectionTypeEnum + DatasetCollectionTypeEnum, + DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants'; import { NextAPI } from '@/service/middleware/entry'; @@ -15,15 +16,7 @@ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset'; async function handler(req: NextApiRequest): CreateCollectionResponse { - const { - name, - apiFileId, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt, - ...body - } = req.body as 
ApiDatasetCreateDatasetCollectionParams; + const { name, apiFileId, ...body } = req.body as ApiDatasetCreateDatasetCollectionParams; const { teamId, tmbId, dataset } = await authDataset({ req, @@ -56,7 +49,8 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { feishuServer, yuqueServer, apiFileId, - teamId + teamId, + tmbId }); const { collectionId, insertResults } = await createCollectionAndInsertData({ @@ -69,10 +63,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { tmbId, type: DatasetCollectionTypeEnum.apiFile, name: name, - trainingType, - chunkSize, - chunkSplitter, - qaPrompt, apiFileId, metadata: { relatedImgId: apiFileId diff --git a/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts b/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts index 443858123..f7178492d 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/csvTable.ts @@ -4,6 +4,7 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; import { + DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; @@ -15,7 +16,6 @@ import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schem async function handler(req: NextApiRequest): CreateCollectionResponse { const { datasetId, parentId, fileId, ...body } = req.body as FileIdCreateDatasetCollectionParams; - const trainingType = TrainingModeEnum.chunk; const { teamId, tmbId, dataset } = await authDataset({ req, authToken: true, @@ -27,6 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { // 1. 
read file const { rawText, filename } = await readFileContentFromMongo({ teamId, + tmbId, bucketName: BucketNameEnum.dataset, fileId, isQAImport: true @@ -47,7 +48,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { fileId, // special metadata - trainingType, + trainingType: DatasetCollectionDataProcessModeEnum.chunk, chunkSize: 0 } }); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts b/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts index 967c2ba31..2dcdc8ee5 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts @@ -2,12 +2,8 @@ import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/co import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; -import { - DatasetCollectionTypeEnum, - TrainingModeEnum -} from '@fastgpt/global/core/dataset/constants'; +import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; -import { hashStr } from '@fastgpt/global/common/string/tools'; import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema'; import { NextAPI } from '@/service/middleware/entry'; import { ApiRequestProps } from '@fastgpt/service/type/next'; @@ -17,14 +13,7 @@ import { CreateCollectionResponse } from '@/global/core/dataset/api'; async function handler( req: ApiRequestProps<FileIdCreateDatasetCollectionParams> ): CreateCollectionResponse { - const { - fileId, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt, - ...body - } = req.body; + const { fileId, customPdfParse, ...body } = req.body; const { 
teamId, tmbId, dataset } = await authDataset({ req, @@ -37,8 +26,10 @@ async function handler( // 1. read file const { rawText, filename } = await readFileContentFromMongo({ teamId, + tmbId, bucketName: BucketNameEnum.dataset, - fileId + fileId, + customPdfParse }); const { collectionId, insertResults } = await createCollectionAndInsertData({ @@ -54,12 +45,7 @@ async function handler( metadata: { relatedImgId: fileId }, - - // special metadata - trainingType, - chunkSize, - chunkSplitter, - qaPrompt + customPdfParse }, relatedId: fileId diff --git a/projects/app/src/pages/api/core/dataset/collection/create/link.ts b/projects/app/src/pages/api/core/dataset/collection/create/link.ts index db9eaf9d8..2c8800434 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/link.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/link.ts @@ -13,14 +13,7 @@ import { urlsFetch } from '@fastgpt/service/common/string/cheerio'; import { hashStr } from '@fastgpt/global/common/string/tools'; async function handler(req: NextApiRequest): CreateCollectionResponse { - const { - link, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt, - ...body - } = req.body as LinkCreateDatasetCollectionParams; + const { link, ...body } = req.body as LinkCreateDatasetCollectionParams; const { teamId, tmbId, dataset } = await authDataset({ req, @@ -36,8 +29,8 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { }); const { title = link, content = '' } = result[0]; - if (!content) { - return Promise.reject('Can not fetch content from link'); + if (!content || content === 'Cannot fetch internal url') { + return Promise.reject(content || 'Can not fetch content from link'); } const { collectionId, insertResults } = await createCollectionAndInsertData({ @@ -45,7 +38,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { rawText: content, createCollectionParams: { ...body, - name: title, + name: 
title || link, teamId, tmbId, type: DatasetCollectionTypeEnum.link, @@ -53,12 +46,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { relatedImgId: link, webPageSelector: body?.metadata?.webPageSelector }, - - trainingType, - chunkSize, - chunkSplitter, - qaPrompt, - rawLink: link }, diff --git a/projects/app/src/pages/api/core/dataset/collection/create/localFile.ts b/projects/app/src/pages/api/core/dataset/collection/create/localFile.ts index f8548f931..a8b8e1a1a 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/localFile.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/localFile.ts @@ -6,7 +6,7 @@ import { FileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/ import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; -import { getNanoid, hashStr } from '@fastgpt/global/common/string/tools'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils'; import { NextAPI } from '@/service/middleware/entry'; @@ -48,8 +48,10 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>): CreateCo // 1. 
read file const { rawText } = await readRawTextByLocalFile({ teamId, + tmbId, path: file.path, encoding: file.encoding, + customPdfParse: collectionData.customPdfParse, metadata: { ...fileMetadata, relatedId: relatedImgId diff --git a/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts b/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts index 48cccdd26..9671d9912 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/reTrainingCollection.ts @@ -24,20 +24,14 @@ type RetrainingCollectionResponse = { async function handler( req: ApiRequestProps<reTrainingDatasetFileCollectionParams> ): Promise<RetrainingCollectionResponse> { - const { - collectionId, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt - } = req.body; + const { collectionId, customPdfParse, ...data } = req.body; if (!collectionId) { return Promise.reject(CommonErrEnum.missingParams); } // 凭证校验 - const { collection } = await authDatasetCollection({ + const { collection, teamId, tmbId } = await authDatasetCollection({ req, authToken: true, authApiKey: true, @@ -84,7 +78,9 @@ async function handler( })(); const rawText = await readDatasetSourceRawText({ - teamId: collection.teamId, + teamId, + tmbId, + customPdfParse, ...sourceReadType }); @@ -100,12 +96,15 @@ async function handler( dataset: collection.dataset, rawText, createCollectionParams: { + ...data, teamId: collection.teamId, tmbId: collection.tmbId, datasetId: collection.dataset._id, name: collection.name, type: collection.type, + customPdfParse, + fileId: collection.fileId, rawLink: collection.rawLink, externalFileId: collection.externalFileId, @@ -121,10 +120,6 @@ async function handler( parentId: collection.parentId, // special metadata - trainingType, - chunkSize, - chunkSplitter, - qaPrompt, metadata: collection.metadata } }); diff --git 
a/projects/app/src/pages/api/core/dataset/collection/create/text.ts b/projects/app/src/pages/api/core/dataset/collection/create/text.ts index ec3af225d..c77b50d57 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/text.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/text.ts @@ -2,25 +2,13 @@ import type { NextApiRequest } from 'next'; import type { TextCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; -import { - TrainingModeEnum, - DatasetCollectionTypeEnum -} from '@fastgpt/global/core/dataset/constants'; -import { hashStr } from '@fastgpt/global/common/string/tools'; +import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { NextAPI } from '@/service/middleware/entry'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { CreateCollectionResponse } from '@/global/core/dataset/api'; async function handler(req: NextApiRequest): CreateCollectionResponse { - const { - name, - text, - trainingType = TrainingModeEnum.chunk, - chunkSize = 512, - chunkSplitter, - qaPrompt, - ...body - } = req.body as TextCreateDatasetCollectionParams; + const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams; const { teamId, tmbId, dataset } = await authDataset({ req, @@ -39,11 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { tmbId, type: DatasetCollectionTypeEnum.virtual, - name, - trainingType, - chunkSize, - chunkSplitter, - qaPrompt + name } }); diff --git a/projects/app/src/pages/api/core/dataset/create.ts b/projects/app/src/pages/api/core/dataset/create.ts index 1d356b6c8..40e4b5ef2 100644 --- a/projects/app/src/pages/api/core/dataset/create.ts +++ b/projects/app/src/pages/api/core/dataset/create.ts @@ -6,12 +6,12 @@ 
import { getLLMModel, getEmbeddingModel, getDatasetModel, - getDefaultEmbeddingModel + getDefaultEmbeddingModel, + getVlmModel } from '@fastgpt/service/core/ai/model'; import { checkTeamDatasetLimit } from '@fastgpt/service/support/permission/teamLimit'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { NextAPI } from '@/service/middleware/entry'; -import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset'; import type { ApiRequestProps } from '@fastgpt/service/type/next'; import { parseParentIdInMongo } from '@fastgpt/global/common/parentFolder/utils'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; @@ -32,8 +32,9 @@ async function handler( intro, type = DatasetTypeEnum.dataset, avatar, - vectorModel = getDefaultEmbeddingModel().model, - agentModel = getDatasetModel().model, + vectorModel = getDefaultEmbeddingModel()?.model, + agentModel = getDatasetModel()?.model, + vlmModel, apiServer, feishuServer, yuqueServer @@ -63,8 +64,11 @@ async function handler( // check model valid const vectorModelStore = getEmbeddingModel(vectorModel); const agentModelStore = getLLMModel(agentModel); - if (!vectorModelStore || !agentModelStore) { - return Promise.reject(DatasetErrEnum.invalidVectorModelOrQAModel); + if (!vectorModelStore) { + return Promise.reject(`System not embedding model`); + } + if (!agentModelStore) { + return Promise.reject(`System not llm model`); } // check limit @@ -81,6 +85,7 @@ async function handler( tmbId, vectorModel, agentModel, + vlmModel, avatar, type, apiServer, diff --git a/projects/app/src/pages/api/core/dataset/data/pushData.ts b/projects/app/src/pages/api/core/dataset/data/pushData.ts index 2df622353..de0a8f000 100644 --- a/projects/app/src/pages/api/core/dataset/data/pushData.ts +++ b/projects/app/src/pages/api/core/dataset/data/pushData.ts @@ -7,9 +7,13 @@ import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils'; import { 
pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller'; import { NextAPI } from '@/service/middleware/entry'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; +import { getTrainingModeByCollection } from '@fastgpt/service/core/dataset/collection/utils'; async function handler(req: NextApiRequest, res: NextApiResponse<any>) { const body = req.body as PushDatasetDataProps; + // Adapter 4.9.0 + body.trainingType = body.trainingType || body.trainingMode; + const { collectionId, data } = body; if (!collectionId || !Array.isArray(data)) { @@ -32,7 +36,7 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) { // auth dataset limit await checkDatasetLimit({ teamId, - insertLen: predictDataLimitLength(collection.trainingType, data) + insertLen: predictDataLimitLength(getTrainingModeByCollection(collection), data) }); return pushDataListToTrainingQueue({ @@ -40,8 +44,9 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) { teamId, tmbId, datasetId: collection.datasetId, + vectorModel: collection.dataset.vectorModel, agentModel: collection.dataset.agentModel, - vectorModel: collection.dataset.vectorModel + vlmModel: collection.dataset.vlmModel }); } diff --git a/projects/app/src/pages/api/core/dataset/detail.ts b/projects/app/src/pages/api/core/dataset/detail.ts index 6f96d7404..f1e17e4e8 100644 --- a/projects/app/src/pages/api/core/dataset/detail.ts +++ b/projects/app/src/pages/api/core/dataset/detail.ts @@ -1,4 +1,4 @@ -import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model'; +import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant'; import { NextAPI } from '@/service/middleware/entry'; @@ -51,7 +51,8 @@ async function handler(req: 
ApiRequestProps<Query>): Promise<DatasetItemType> { : undefined, permission, vectorModel: getEmbeddingModel(dataset.vectorModel), - agentModel: getLLMModel(dataset.agentModel) + agentModel: getLLMModel(dataset.agentModel), + vlmModel: getVlmModel(dataset.vlmModel) }; } diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index d1d22b3fa..690c016a3 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -17,6 +17,7 @@ export type PostPreviewFilesChunksProps = { chunkSize: number; overlapRatio: number; customSplitChar?: string; + customPdfParse?: boolean; // Read params selector?: string; @@ -40,7 +41,8 @@ async function handler( selector, isQAImport, datasetId, - externalFileId + externalFileId, + customPdfParse = false } = req.body; if (!sourceId) { @@ -50,7 +52,7 @@ async function handler( throw new Error('chunkSize is too large, should be less than 30000'); } - const { teamId, apiServer, feishuServer, yuqueServer } = await (async () => { + const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => { if (type === DatasetSourceReadTypeEnum.fileLocal) { const res = await authCollectionFile({ req, @@ -60,10 +62,11 @@ async function handler( per: OwnerPermissionVal }); return { - teamId: res.teamId + teamId: res.teamId, + tmbId: res.tmbId }; } - const { dataset } = await authDataset({ + const { dataset, teamId, tmbId } = await authDataset({ req, authApiKey: true, authToken: true, @@ -71,7 +74,8 @@ async function handler( per: WritePermissionVal }); return { - teamId: dataset.teamId, + teamId, + tmbId, apiServer: dataset.apiServer, feishuServer: dataset.feishuServer, yuqueServer: dataset.yuqueServer @@ -80,6 +84,7 @@ async function handler( const rawText = await readDatasetSourceRawText({ teamId, + tmbId, type, sourceId, selector, @@ -87,7 +92,8 @@ async 
function handler( apiServer, feishuServer, yuqueServer, - externalFileId + externalFileId, + customPdfParse }); return rawText2Chunks({ @@ -96,6 +102,6 @@ async function handler( overlapRatio, customReg: customSplitChar ? [customSplitChar] : [], isQAImport: isQAImport - }).slice(0, 15); + }).slice(0, 10); } export default NextAPI(handler); diff --git a/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts b/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts index a1ac81d34..851547447 100644 --- a/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts +++ b/projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts @@ -6,7 +6,7 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'; -import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model'; +import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { ApiRequestProps } from '@fastgpt/service/type/next'; import { OwnerPermissionVal } from '@fastgpt/global/support/permission/constant'; @@ -50,7 +50,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp appName: '切换索引模型', billSource: UsageSourceEnum.training, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, - agentModel: getLLMModel(dataset.agentModel)?.name + agentModel: getLLMModel(dataset.agentModel)?.name, + vllmModel: getVlmModel(dataset.vlmModel)?.name }); // update vector model and dataset.data rebuild field diff --git a/projects/app/src/pages/api/core/dataset/update.ts b/projects/app/src/pages/api/core/dataset/update.ts index f20eeeb6e..2ebff1121 
100644 --- a/projects/app/src/pages/api/core/dataset/update.ts +++ b/projects/app/src/pages/api/core/dataset/update.ts @@ -56,6 +56,7 @@ async function handler( avatar, intro, agentModel, + vlmModel, websiteConfig, externalReadUrl, apiServer, @@ -109,7 +110,7 @@ async function handler( updateTraining({ teamId: dataset.teamId, datasetId: id, - agentModel: agentModel?.model + agentModel }); const onUpdate = async (session: ClientSession) => { @@ -119,7 +120,8 @@ async function handler( ...parseParentIdInMongo(parentId), ...(name && { name }), ...(avatar && { avatar }), - ...(agentModel && { agentModel: agentModel.model }), + ...(agentModel && { agentModel }), + ...(vlmModel && { vlmModel }), ...(websiteConfig && { websiteConfig }), ...(status && { status }), ...(intro !== undefined && { intro }), @@ -212,7 +214,7 @@ const updateTraining = async ({ $set: { model: agentModel, retryCount: 5, - lockTime: new Date() + lockTime: new Date('2000/1/1') } } ); diff --git a/projects/app/src/pages/api/support/wallet/usage/createTrainingUsage.ts b/projects/app/src/pages/api/support/wallet/usage/createTrainingUsage.ts index 3b96351a3..47dafcf42 100644 --- a/projects/app/src/pages/api/support/wallet/usage/createTrainingUsage.ts +++ b/projects/app/src/pages/api/support/wallet/usage/createTrainingUsage.ts @@ -1,7 +1,7 @@ -import type { NextApiRequest, NextApiResponse } from 'next'; +import type { NextApiRequest } from 'next'; import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'; import { CreateTrainingUsageProps } from '@fastgpt/global/support/wallet/usage/api.d'; -import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model'; +import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model'; import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from 
'@fastgpt/global/support/permission/constant'; @@ -24,7 +24,8 @@ async function handler(req: NextApiRequest) { appName: name, billSource: UsageSourceEnum.training, vectorModel: getEmbeddingModel(dataset.vectorModel).name, - agentModel: getLLMModel(dataset.agentModel).name + agentModel: getLLMModel(dataset.agentModel).name, + vllmModel: getVlmModel(dataset.vlmModel)?.name }); return billId; diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index f6298e2ad..ae77ce77e 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -8,12 +8,60 @@ import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/con import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils'; import { jiebaSplit } from '@fastgpt/service/common/string/jieba'; import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller'; -import { DatasetDataItemType } from '@fastgpt/global/core/dataset/type'; +import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type'; import { getEmbeddingModel } from '@fastgpt/service/core/ai/model'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { ClientSession } from '@fastgpt/service/common/mongo'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; +import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; +const formatIndexes = ({ + indexes, + q, + a = '' +}: { + indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[]; + q: string; + a?: string; +}) => { + indexes = indexes || []; + // If index not type, set it to custom + indexes = indexes + .map((item) => ({ + text: typeof item.text === 'string' ? 
item.text : String(item.text), + type: item.type || DatasetDataIndexTypeEnum.custom, + dataId: item.dataId + })) + .filter((item) => !!item.text.trim()); + + // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds + const defaultIndexes = getDefaultIndex({ q, a }); + const concatDefaultIndexes = defaultIndexes.map((item) => { + const oldIndex = indexes!.find((index) => index.text === item.text); + if (oldIndex) { + return { + type: DatasetDataIndexTypeEnum.default, + text: item.text, + dataId: oldIndex.dataId + }; + } else { + return item; + } + }); + indexes = indexes.filter((item) => item.type !== DatasetDataIndexTypeEnum.default); + indexes.push(...concatDefaultIndexes); + + // Filter same text + indexes = indexes.filter( + (item, index, self) => index === self.findIndex((t) => t.text === item.text) + ); + + return indexes.map((index) => ({ + type: index.type, + text: index.text, + dataId: index.dataId + })); +}; /* insert data. * 1. create data id * 2. insert pg @@ -41,42 +89,28 @@ export async function insertData2Dataset({ return Promise.reject("teamId and tmbId can't be the same"); } - const qaStr = getDefaultIndex({ q, a }).text; - // 1. Get vector indexes and insert // Empty indexes check, if empty, create default index - indexes = - Array.isArray(indexes) && indexes.length > 0 - ? 
indexes.map((index) => ({ - text: index.text, - dataId: undefined, - defaultIndex: index.text.trim() === qaStr - })) - : [getDefaultIndex({ q, a })]; - - if (!indexes.find((index) => index.defaultIndex)) { - indexes.unshift(getDefaultIndex({ q, a })); - } else if (q && a && !indexes.find((index) => index.text === q)) { - // push a q index - indexes.push({ - defaultIndex: false, - text: q - }); - } - - indexes = indexes.slice(0, 6); + const newIndexes = formatIndexes({ indexes, q, a }); // insert to vector store const result = await Promise.all( - indexes.map((item) => - insertDatasetDataVector({ + newIndexes.map(async (item) => { + const result = await insertDatasetDataVector({ query: item.text, model: getEmbeddingModel(model), teamId, datasetId, collectionId - }) - ) + }); + return { + tokens: result.tokens, + index: { + ...item, + dataId: result.insertId + } + }; + }) ); // 2. Create mongo data @@ -89,13 +123,8 @@ export async function insertData2Dataset({ collectionId, q, a, - // FullText tmp - // fullTextToken: jiebaSplit({ text: qaStr }), chunkIndex, - indexes: indexes?.map((item, i) => ({ - ...item, - dataId: result[i].insertId - })) + indexes: result.map((item) => item.index) } ], { session, ordered: true } @@ -109,7 +138,7 @@ export async function insertData2Dataset({ datasetId, collectionId, dataId: _id, - fullTextToken: jiebaSplit({ text: qaStr }) + fullTextToken: jiebaSplit({ text: `${q}\n${a}`.trim() }) } ], { session, ordered: true } @@ -122,7 +151,7 @@ export async function insertData2Dataset({ } /** - * update data + * Update data(indexes overwrite) * 1. compare indexes * 2. insert new pg data * session run: @@ -139,30 +168,19 @@ export async function updateData2Dataset({ if (!Array.isArray(indexes)) { return Promise.reject('indexes is required'); } - const qaStr = getDefaultIndex({ q, a }).text; - // patch index and update pg + // 1. 
Get mongo data const mongoData = await MongoDatasetData.findById(dataId); if (!mongoData) return Promise.reject('core.dataset.error.Data not found'); - // remove defaultIndex - let formatIndexes = indexes.map((index) => ({ - ...index, - text: index.text.trim(), - defaultIndex: index.text.trim() === qaStr - })); - if (!formatIndexes.find((index) => index.defaultIndex)) { - const defaultIndex = mongoData.indexes.find((index) => index.defaultIndex); - formatIndexes.unshift(defaultIndex ? defaultIndex : getDefaultIndex({ q, a })); - } - formatIndexes = formatIndexes.slice(0, 6); + // 2. Compute indexes + const formatIndexesResult = formatIndexes({ indexes, q, a }); - // patch indexes, create, update, delete + // 3. Patch indexes, create, update, delete const patchResult: PatchIndexesProps[] = []; - // find database indexes in new Indexes, if have not, delete it for (const item of mongoData.indexes) { - const index = formatIndexes.find((index) => index.dataId === item.dataId); + const index = formatIndexesResult.find((index) => index.dataId === item.dataId); if (!index) { patchResult.push({ type: 'delete', @@ -170,53 +188,48 @@ export async function updateData2Dataset({ }); } } - for (const item of formatIndexes) { - const index = mongoData.indexes.find((index) => index.dataId === item.dataId); - // in database, update - if (index) { - // default index update - if (index.defaultIndex && index.text !== qaStr) { - patchResult.push({ - type: 'update', - index: { - //@ts-ignore - ...index.toObject(), - text: qaStr - } - }); - continue; - } - // custom index update - if (index.text !== item.text) { - patchResult.push({ - type: 'update', - index: item - }); - continue; - } - patchResult.push({ - type: 'unChange', - index: item - }); - } else { - // not in database, create + for (const item of formatIndexesResult) { + if (!item.dataId) { patchResult.push({ type: 'create', index: item }); + } else { + const index = mongoData.indexes.find((index) => index.dataId === 
item.dataId); + if (!index) continue; + + // Not change + if (index.text === item.text) { + patchResult.push({ + type: 'unChange', + index: { + ...item, + dataId: index.dataId + } + }); + } else { + // index Update + patchResult.push({ + type: 'update', + index: { + ...item, + dataId: index.dataId + } + }); + } } } - // update mongo updateTime + // 4. Update mongo updateTime(便于脏数据检查器识别) mongoData.updateTime = new Date(); await mongoData.save(); - // insert vector - const clonePatchResult2Insert: PatchIndexesProps[] = JSON.parse(JSON.stringify(patchResult)); + // 5. Insert vector const insertResult = await Promise.all( - clonePatchResult2Insert.map(async (item) => { - // insert new vector and update dateId - if (item.type === 'create' || item.type === 'update') { + patchResult + .filter((item) => item.type === 'create' || item.type === 'update') + .map(async (item) => { + // insert new vector and update dateId const result = await insertDatasetDataVector({ query: item.index.text, model: getEmbeddingModel(model), @@ -225,26 +238,22 @@ export async function updateData2Dataset({ collectionId: mongoData.collectionId }); item.index.dataId = result.insertId; - return result; - } - return { - tokens: 0 - }; - }) + return { + tokens: result.tokens + }; + }) ); const tokens = insertResult.reduce((acc, cur) => acc + cur.tokens, 0); + + const newIndexes = patchResult + .filter((item) => item.type !== 'delete') + .map((item) => item.index) as DatasetDataIndexItemType[]; + // console.log(clonePatchResult2Insert); await mongoSessionRun(async (session) => { - // update mongo - const newIndexes = clonePatchResult2Insert - .filter((item) => item.type !== 'delete') - .map((item) => item.index); - // update mongo other data + // Update MongoData mongoData.q = q || mongoData.q; mongoData.a = a ?? 
mongoData.a; - // FullText tmp - // mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() }); - // @ts-ignore mongoData.indexes = newIndexes; await mongoData.save({ session }); @@ -255,15 +264,15 @@ export async function updateData2Dataset({ { session } ); - // delete vector + // Delete vector const deleteIdList = patchResult .filter((item) => item.type === 'delete' || item.type === 'update') .map((item) => item.index.dataId) - .filter(Boolean); + .filter(Boolean) as string[]; if (deleteIdList.length > 0) { await deleteDatasetDataVector({ teamId: mongoData.teamId, - idList: deleteIdList as string[] + idList: deleteIdList }); } }); diff --git a/projects/app/src/service/events/generateQA.ts b/projects/app/src/service/events/generateQA.ts index 4da78fa5b..4a335cc22 100644 --- a/projects/app/src/service/events/generateQA.ts +++ b/projects/app/src/service/events/generateQA.ts @@ -142,7 +142,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`; teamId: data.teamId, tmbId: data.tmbId, collectionId: data.collectionId, - trainingMode: TrainingModeEnum.chunk, + mode: TrainingModeEnum.chunk, data: qaArr.map((item) => ({ ...item, chunkIndex: data.chunkIndex @@ -179,9 +179,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`; } } -/** - * 检查文本是否按格式返回 - */ +// Format qa answer function formatSplitText(text: string, rawText: string) { text = text.replace(/\\n/g, '\n'); // 将换行符替换为空格 const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式 @@ -194,13 +192,7 @@ function formatSplitText(text: string, rawText: string) { if (q) { result.push({ q, - a, - indexes: [ - { - defaultIndex: true, - text: `${q}\n${a.trim().replace(/\n\s*/g, '\n')}` - } - ] + a }); } } @@ -211,13 +203,7 @@ function formatSplitText(text: string, rawText: string) { chunks.forEach((chunk) => { result.push({ q: chunk, - a: '', - indexes: [ - { - defaultIndex: true, - text: chunk - } - ] + a: '' }); }); } diff --git 
a/projects/app/src/service/events/generateVector.ts b/projects/app/src/service/events/generateVector.ts index 8dc8bb87b..a8e4f5ca9 100644 --- a/projects/app/src/service/events/generateVector.ts +++ b/projects/app/src/service/events/generateVector.ts @@ -20,6 +20,16 @@ const reduceQueue = () => { return global.vectorQueueLen === 0; }; +const reduceQueueAndReturn = (delay = 0) => { + reduceQueue(); + if (delay) { + setTimeout(() => { + generateVector(); + }, delay); + } else { + generateVector(); + } +}; /* 索引生成队列。每导入一次,就是一个单独的线程 */ export async function generateVector(): Promise<any> { @@ -45,20 +55,7 @@ export async function generateVector(): Promise<any> { lockTime: new Date(), $inc: { retryCount: -1 } } - ).select({ - _id: 1, - teamId: 1, - tmbId: 1, - datasetId: 1, - collectionId: 1, - q: 1, - a: 1, - chunkIndex: 1, - dataId: 1, - indexes: 1, - model: 1, - billId: 1 - }); + ); // task preemption if (!data) { @@ -85,14 +82,12 @@ export async function generateVector(): Promise<any> { } if (error) { addLog.error(`[Vector Queue] Error`, { error }); - reduceQueue(); - return generateVector(); + return reduceQueueAndReturn(); } // auth balance if (!(await checkTeamAiPointsAndLock(data.teamId))) { - reduceQueue(); - return generateVector(); + return reduceQueueAndReturn(); } addLog.info(`[Vector Queue] Start`); @@ -119,15 +114,10 @@ export async function generateVector(): Promise<any> { time: Date.now() - start }); - reduceQueue(); - generateVector(); + return reduceQueueAndReturn(); } catch (err: any) { addLog.error(`[Vector Queue] Error`, err); - reduceQueue(); - - setTimeout(() => { - generateVector(); - }, 1000); + return reduceQueueAndReturn(1000); } } diff --git a/projects/app/src/service/support/wallet/usage/push.ts b/projects/app/src/service/support/wallet/usage/push.ts index fe14f7058..c6580ca91 100644 --- a/projects/app/src/service/support/wallet/usage/push.ts +++ b/projects/app/src/service/support/wallet/usage/push.ts @@ -127,12 +127,12 @@ export const 
pushGenerateVectorUsage = ({ createUsage({ teamId, tmbId, - appName: i18nT('common:support.wallet.moduleName.index'), + appName: i18nT('account_usage:embedding_index'), totalPoints, source, list: [ { - moduleName: i18nT('common:support.wallet.moduleName.index'), + moduleName: i18nT('account_usage:embedding_index'), amount: totalVector, model: vectorModelName, inputTokens @@ -203,7 +203,7 @@ export const pushQuestionGuideUsage = ({ }); }; -export function pushAudioSpeechUsage({ +export const pushAudioSpeechUsage = ({ appName = i18nT('common:support.wallet.usage.Audio Speech'), model, charsLength, @@ -217,7 +217,7 @@ export function pushAudioSpeechUsage({ teamId: string; tmbId: string; source: UsageSourceEnum; -}) { +}) => { const { totalPoints, modelName } = formatModelChars2Points({ model, inputTokens: charsLength, @@ -239,9 +239,9 @@ export function pushAudioSpeechUsage({ } ] }); -} +}; -export function pushWhisperUsage({ +export const pushWhisperUsage = ({ teamId, tmbId, duration @@ -249,7 +249,7 @@ export function pushWhisperUsage({ teamId: string; tmbId: string; duration: number; -}) { +}) => { const whisperModel = getDefaultTTSModel(); if (!whisperModel) return; @@ -278,4 +278,4 @@ export function pushWhisperUsage({ } ] }); -} +}; diff --git a/projects/app/src/web/common/file/api.ts b/projects/app/src/web/common/file/api.ts index 32a899699..d11ebabd0 100644 --- a/projects/app/src/web/common/file/api.ts +++ b/projects/app/src/web/common/file/api.ts @@ -1,4 +1,3 @@ -import type { PreviewContextProps } from '@/pages/api/common/file/previewContent'; import { GET, POST } from '@/web/common/api/request'; import type { UploadImgProps } from '@fastgpt/global/common/file/api.d'; import { AxiosProgressEvent } from 'axios'; @@ -19,11 +18,3 @@ export const postUploadFiles = ( 'Content-Type': 'multipart/form-data; charset=utf-8' } }); - -export const getPreviewFileContent = (data: PreviewContextProps) => - POST<{ - previewContent: string; - totalLength: number; - 
}>('/common/file/previewContent', data, { - timeout: 600000 - }); diff --git a/projects/app/src/web/common/system/useSystemStore.ts b/projects/app/src/web/common/system/useSystemStore.ts index 68feecd3f..373b389e4 100644 --- a/projects/app/src/web/common/system/useSystemStore.ts +++ b/projects/app/src/web/common/system/useSystemStore.ts @@ -53,6 +53,7 @@ type State = { defaultModels: SystemDefaultModelType; llmModelList: LLMModelItemType[]; datasetModelList: LLMModelItemType[]; + getVlmModelList: () => LLMModelItemType[]; embeddingModelList: EmbeddingModelItemType[]; ttsModelList: TTSModelType[]; reRankModelList: ReRankModelItemType[]; @@ -134,6 +135,9 @@ export const useSystemStore = create<State>()( ttsModelList: [], reRankModelList: [], sttModelList: [], + getVlmModelList: () => { + return get().llmModelList.filter((item) => item.vision); + }, initStaticData(res) { set((state) => { state.initDataBufferId = res.bufferId; diff --git a/projects/app/src/web/core/ai/channel.ts b/projects/app/src/web/core/ai/channel.ts index b7a10061b..fd4534a3f 100644 --- a/projects/app/src/web/core/ai/channel.ts +++ b/projects/app/src/web/core/ai/channel.ts @@ -130,7 +130,8 @@ export const postCreateChannel = (data: CreateChannelProps) => base_url: data.base_url, models: data.models, model_mapping: data.model_mapping, - key: data.key + key: data.key, + priority: 1 }); export const putChannelStatus = (id: number, status: ChannelStatusEnum) => @@ -146,7 +147,7 @@ export const putChannel = (data: ChannelInfoType) => model_mapping: data.model_mapping, key: data.key, status: data.status, - priority: data.priority + priority: data.priority ? 
Math.max(data.priority, 1) : undefined }); export const deleteChannel = (id: number) => DELETE(`/channel/${id}`); diff --git a/projects/app/src/web/core/app/utils.ts b/projects/app/src/web/core/app/utils.ts index 016a6bcef..546efea02 100644 --- a/projects/app/src/web/core/app/utils.ts +++ b/projects/app/src/web/core/app/utils.ts @@ -504,6 +504,13 @@ export function form2AppWorkflow( label: '', valueType: WorkflowIOValueTypeEnum.boolean, value: true + }, + { + key: NodeInputKeyEnum.aiChatReasoning, + renderTypeList: [FlowNodeInputTypeEnum.hidden], + label: '', + valueType: WorkflowIOValueTypeEnum.boolean, + value: formData.aiSettings.aiChatReasoning } ], outputs: ToolModule.outputs diff --git a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts index 94a467c46..e44c67f09 100644 --- a/projects/app/src/web/core/dataset/api.ts +++ b/projects/app/src/web/core/dataset/api.ts @@ -215,7 +215,10 @@ export const getDatasetTrainingQueue = (datasetId: string) => }); export const getPreviewChunks = (data: PostPreviewFilesChunksProps) => - POST<PreviewChunksResponse>('/core/dataset/file/getPreviewChunks', data); + POST<PreviewChunksResponse>('/core/dataset/file/getPreviewChunks', data, { + maxQuantity: 1, + timeout: 600000 + }); /* ================== read source ======================== */ export const getCollectionSource = (data: readCollectionSourceBody) => diff --git a/projects/app/src/web/core/dataset/constants.ts b/projects/app/src/web/core/dataset/constants.ts index be67d3736..860e73502 100644 --- a/projects/app/src/web/core/dataset/constants.ts +++ b/projects/app/src/web/core/dataset/constants.ts @@ -1,8 +1,8 @@ import { defaultQAModels, defaultVectorModels } from '@fastgpt/global/core/ai/model'; import { + DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, - DatasetTypeEnum, - TrainingModeEnum + DatasetTypeEnum } from '@fastgpt/global/core/dataset/constants'; import type { DatasetCollectionItemType, @@ -25,6 +25,7 @@ export 
const defaultDatasetDetail: DatasetItemType = { permission: new DatasetPermission(), vectorModel: defaultVectorModels[0], agentModel: defaultQAModels[0], + vlmModel: defaultQAModels[0], inheritPermission: true }; @@ -57,13 +58,13 @@ export const defaultCollectionDetail: DatasetCollectionItemType = { sourceName: '', sourceId: '', createTime: new Date(), - trainingType: TrainingModeEnum.chunk, + trainingType: DatasetCollectionDataProcessModeEnum.chunk, chunkSize: 0, permission: new DatasetPermission(), indexAmount: 0 }; -export enum ImportProcessWayEnum { +export enum ChunkSettingModeEnum { auto = 'auto', custom = 'custom' } diff --git a/projects/app/src/web/core/dataset/context/datasetPageContext.tsx b/projects/app/src/web/core/dataset/context/datasetPageContext.tsx index 46ab86361..26601605f 100644 --- a/projects/app/src/web/core/dataset/context/datasetPageContext.tsx +++ b/projects/app/src/web/core/dataset/context/datasetPageContext.tsx @@ -18,6 +18,7 @@ import { DatasetItemType, DatasetTagType } from '@fastgpt/global/core/dataset/ty import { useSystemStore } from '@/web/common/system/useSystemStore'; import { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { getWebLLMModel } from '@/web/common/system/utils'; type DatasetPageContextType = { datasetId: string; @@ -116,6 +117,8 @@ export const DatasetPageContextProvider = ({ setDatasetDetail((state) => ({ ...state, ...data, + agentModel: getWebLLMModel(data.agentModel), + vlmModel: getWebLLMModel(data.vlmModel), apiServer: data.apiServer ? 
{ baseUrl: data.apiServer.baseUrl, diff --git a/projects/app/src/web/core/dataset/type.d.ts b/projects/app/src/web/core/dataset/type.d.ts index 6470e783b..a095bc798 100644 --- a/projects/app/src/web/core/dataset/type.d.ts +++ b/projects/app/src/web/core/dataset/type.d.ts @@ -1,6 +1,6 @@ import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; -import { ImportProcessWayEnum } from './constants'; +import { ChunkSettingModeEnum } from './constants'; import { UseFormReturn } from 'react-hook-form'; import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset'; @@ -44,7 +44,7 @@ export type ImportSourceParamsType = UseFormReturn< customSplitChar: string; prompt: string; mode: TrainingModeEnum; - way: ImportProcessWayEnum; + way: ChunkSettingModeEnum; }, any >; diff --git a/scripts/postinstall.sh b/scripts/postinstall.sh index f332b8dc7..6661aa20e 100755 --- a/scripts/postinstall.sh +++ b/scripts/postinstall.sh @@ -1,4 +1,2 @@ -# 创建临时文件目录 -mkdir -p projects/app/tmp # 初始化UI库的自定义ts类型 pnpm run gen:theme-typings