Compare commits: v4.7 ... v4.7.1-alp
17 Commits

| Author | SHA1 | Date |
| --- | --- | --- |
| | adfad8ff7f | |
| | 1fbc407ecf | |
| | 3b0b2d68cc | |
| | 64db0e4f25 | |
| | 5cfa43287f | |
| | a01b945bc9 | |
| | 3b99e05cdc | |
| | 8a46372418 | |
| | 9ae581e09b | |
| | 21288d1736 | |
| | f9d266a6af | |
| | 692e75627b | |
| | 018424c0fa | |
| | 0490b83b9e | |
| | 00ace0b69c | |
| | 3f892bd810 | |
| | d127060bc8 | |
.github/imgs/logo.svg: 34 changes

@@ -1,14 +1,20 @@
- <svg width="32" height="32" viewBox="0 0 1041 1348" fill="none" xmlns="http://www.w3.org/2000/svg">
- <path d="M340.837 0.33933L681.068 0.338989V0.455643C684.032 0.378397 686.999 0.339702 689.967 0.339702C735.961 0.3397 781.504 9.62899 823.997 27.6772C866.49 45.7254 905.099 72.1791 937.622 105.528C970.144 138.877 995.942 178.467 1013.54 222.04C1031.14 265.612 1040.2 312.312 1040.2 359.474L340.836 359.474L340.836 1347.84C296.157 1347.84 251.914 1338.55 210.636 1320.49C169.357 1302.43 131.85 1275.95 100.257 1242.58C68.6636 1209.21 43.6023 1169.59 26.5041 1125.99C11.3834 1087.43 2.75216 1046.42 0.957956 1004.81H0.605869L0.605897 368.098H0.70363C0.105752 341.831 2.23741 315.443 7.14306 289.411C20.2709 219.745 52.6748 155.754 100.257 105.528C147.839 55.3017 208.462 21.0975 274.461 7.24017C296.426 2.62833 318.657 0.339101 340.837 0.33933Z" fill="url(#paint0_linear_1172_228)"/>
- <path d="M633.639 904.645H513.029V576.37H635.422V576.377C678.161 576.607 720.454 585.093 759.951 601.37C799.997 617.874 836.384 642.064 867.033 672.559C897.683 703.054 921.996 739.257 938.583 779.101C955.171 818.944 963.709 861.648 963.709 904.775H633.639V904.645Z" fill="url(#paint1_linear_1172_228)"/>
- <defs>
- <linearGradient id="paint0_linear_1172_228" x1="520.404" y1="0.338989" x2="520.404" y2="1347.84" gradientUnits="userSpaceOnUse">
- <stop stop-color="#326DFF"/>
- <stop offset="1" stop-color="#8EAEFF"/>
- </linearGradient>
- <linearGradient id="paint1_linear_1172_228" x1="738.369" y1="576.37" x2="738.369" y2="904.775" gradientUnits="userSpaceOnUse">
- <stop stop-color="#326DFF"/>
- <stop offset="1" stop-color="#8EAEFF"/>
- </linearGradient>
- </defs>
- </svg>
+ <svg width="49" height="48" viewBox="0 0 49 48" fill="none" xmlns="http://www.w3.org/2000/svg">
+ <path
+ d="M20.3692 7.00001L28.9536 7V7.00294C29.0284 7.00099 29.1033 7.00002 29.1782 7.00002C30.3387 7.00002 31.4878 7.2344 32.5599 7.68979C33.6321 8.14518 34.6062 8.81265 35.4268 9.6541C36.2474 10.4956 36.8983 11.4945 37.3424 12.5939C37.7865 13.6933 38.0151 14.8716 38.0151 16.0616L20.3691 16.0616L20.3691 41C19.2418 41 18.1255 40.7655 17.084 40.3097C16.0425 39.854 15.0961 39.1861 14.299 38.344C13.5018 37.502 12.8695 36.5024 12.4381 35.4022C12.0566 34.4292 11.8388 33.3945 11.7935 32.3446H11.7846L11.7846 16.2792H11.7871C11.772 15.6165 11.8258 14.9506 11.9496 14.2938C12.2808 12.536 13.0984 10.9214 14.299 9.6541C15.4995 8.38681 17.0291 7.52377 18.6944 7.17413C19.2486 7.05776 19.8095 7 20.3692 7.00001Z"
+ fill="url(#paint0_linear_1008_3495)" />
+ <path
+ d="M27.7569 29.8173H24.7138V21.5343H27.8019V21.5345C28.8803 21.5403 29.9474 21.7544 30.944 22.1651C31.9544 22.5815 32.8725 23.1919 33.6458 23.9613C34.4191 24.7308 35.0326 25.6442 35.4511 26.6496C35.8696 27.6549 36.085 28.7324 36.085 29.8205H27.7569V29.8173Z"
+ fill="url(#paint1_linear_1008_3495)" />
+ <defs>
+ <linearGradient id="paint0_linear_1008_3495" x1="24.8999" y1="7" x2="24.8999" y2="41"
+ gradientUnits="userSpaceOnUse">
+ <stop stop-color="#326DFF" />
+ <stop offset="1" stop-color="#8EAEFF" />
+ </linearGradient>
+ <linearGradient id="paint1_linear_1008_3495" x1="30.3994" y1="21.5343" x2="30.3994" y2="29.8205"
+ gradientUnits="userSpaceOnUse">
+ <stop stop-color="#326DFF" />
+ <stop offset="1" stop-color="#8EAEFF" />
+ </linearGradient>
+ </defs>
+ </svg>

Before Size: 1.6 KiB | After Size: 1.7 KiB
@@ -1,4 +1,4 @@
- name: Build docs images and copy image to docker hub
+ name: Deploy image by kubeconfig
  on:
    workflow_dispatch:
    push:
@@ -68,7 +68,7 @@ jobs:
        tags: ${{ steps.meta.outputs.tags }}
        labels: ${{ steps.meta.outputs.labels }}
      outputs:
-       tags: ${{ steps.datetime.outputs.datetime }}
+       tags: ${{ steps.datetime.outputs.datetime }}
  update-docs-image:
    needs: build-fastgpt-docs-images
    runs-on: ubuntu-20.04
@@ -85,4 +85,4 @@ jobs:
      env:
        KUBE_CONFIG: ${{ secrets.KUBE_CONFIG }}
      with:
-       args: annotate deployment/fastgpt-docs originImageName="registry.cn-hangzhou.aliyuncs.com/${{ secrets.ALI_HUB_USERNAME }}/fastgpt-docs:${{ needs.build-fastgpt-docs-images.outputs.tags }}" --overwrite
+       args: annotate deployment/fastgpt-docs originImageName="registry.cn-hangzhou.aliyuncs.com/${{ secrets.ALI_HUB_USERNAME }}/fastgpt-docs:${{ needs.build-fastgpt-docs-images.outputs.tags }}" --overwrite
@@ -1,4 +1,4 @@
- name: deploy-docs
+ name: Deploy image to vercel

  on:
    workflow_dispatch:
@@ -47,7 +47,7 @@ jobs:

    - name: Add cdn for images
      run: |
-       sed -i "s#\](/imgs/#\](https://cdn.jsdelivr.us/gh/yangchuansheng/fastgpt-imgs@main/imgs/#g" $(grep -rl "\](/imgs/" docSite/content/docs)
+       sed -i "s#\](/imgs/#\](https://cdn.jsdelivr.net/gh/yangchuansheng/fastgpt-imgs@main/imgs/#g" $(grep -rl "\](/imgs/" docSite/content/docs)

    # Step 3 - Install Hugo (specific version)
    - name: Install Hugo
.github/workflows/docs-preview.yml: 4 changes

@@ -1,4 +1,4 @@
- name: preview-docs
+ name: Preview FastGPT docs

  on:
    pull_request_target:
@@ -47,7 +47,7 @@ jobs:

    - name: Add cdn for images
      run: |
-       sed -i "s#\](/imgs/#\](https://cdn.jsdelivr.us/gh/yangchuansheng/fastgpt-imgs@main/imgs/#g" $(grep -rl "\](/imgs/" docSite/content/docs)
+       sed -i "s#\](/imgs/#\](https://cdn.jsdelivr.net/gh/yangchuansheng/fastgpt-imgs@main/imgs/#g" $(grep -rl "\](/imgs/" docSite/content/docs)

    # Step 3 - Install Hugo (specific version)
    - name: Install Hugo
.github/workflows/helm-release.yaml: 6 changes

@@ -1,11 +1,10 @@
- name: Release
+ name: Release helm chart

  on:
    push:
      tags:
        - 'v*.*.*'
      branches:
        - master
    workflow_dispatch:

  jobs:
    helm:
@@ -30,5 +29,6 @@ jobs:
        unset APP_VERSION
        unset HELM_VERSION
      fi
+     helm dependency update files/helm/fastgpt
      helm package files/helm/fastgpt --version ${HELM_VERSION}-helm --app-version ${APP_VERSION} -d bin
      helm push bin/fastgpt-${HELM_VERSION}-helm.tgz oci://${HELM_REPO}
.github/workflows/preview-image.yml: 1 change

@@ -62,5 +62,6 @@ jobs:
      uses: actions/checkout@v3
    - name: Helm Check
      run: |
+       helm dependency update files/helm/fastgpt
        helm lint files/helm/fastgpt
        helm package files/helm/fastgpt
README.md: 21 changes

@@ -38,8 +38,6 @@ https://github.com/labring/FastGPT/assets/15308462/7d3a38df-eb0e-4388-9250-2409b

- 🌍 International version: [fastgpt.in](https://fastgpt.in/)

The fastgpt.run domain will be deprecated.

| | |
| ---------------------------------- | ---------------------------------- |
|  |  |
@@ -53,24 +51,21 @@

`1` Application orchestration
- [x] Simple mode: no manual orchestration needed
- [x] Next-question suggestions in conversation
- [x] Workflow orchestration
- [x] Source-file citation tracing
- [x] Module encapsulation for multi-level reuse
- [x] Hybrid retrieval & reranking
- [x] Tool module
- - [ ] Embed [Laf](https://github.com/labring/laf) to write HTTP modules online
- - [ ] Plugin encapsulation
+ - [ ] Embed [Laf](https://github.com/labring/laf) to write HTTP modules online. First version completed.
+ - [ ] Plugin encapsulation with low-code rendering

`2` Knowledge base
- [x] Reuse and mix multiple knowledge bases
- [x] Edit and delete chunk records
- [x] Vector model selectable per knowledge base
- [x] Source file storage
- [x] Manual input, direct chunking, and QA-split import
- - [x] Supports pdf, docx, txt, html, md, csv
+ - [x] Supports txt, md, html, pdf, docx, pptx, csv, xlsx (PR a file loader if you need more)
- [x] URL ingestion and CSV batch import
- [ ] PPT and Excel import
- [x] Hybrid retrieval & reranking
- [ ] File reader support
- [ ] More data preprocessing options

@@ -91,6 +86,8 @@

- [x] One-click Iframe embedding
- [x] Embedded chat window supports custom icon, open-by-default, dragging, etc.
- [x] Centralized review of chat logs with data annotation
+ `6` Others
+ - [x] Voice input and output (voice input with voice answers is configurable)

<a href="#readme">
<img src="https://img.shields.io/badge/-返回顶部-7d09f1.svg" alt="#" align="right">
@@ -104,7 +101,7 @@

> The [Sealos](https://sealos.io) servers are overseas, so no extra network handling is needed: no server, no proxy, no domain, with support for high concurrency & dynamic scaling. Click the button below to deploy in one click 👇

- [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)
+ [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)

Because a database has to be deployed, wait 2~4 minutes after deployment before access works. The lowest configuration is used by default, so the first visit will be a little slow. Usage tutorial: [Deploying FastGPT on Sealos](https://doc.fastgpt.in/docs/development/sealos/)

@@ -114,7 +111,7 @@

* [Multi-model configuration](https://doc.fastgpt.in/docs/development/one-api/)
* [Version updates / upgrade guide](https://doc.fastgpt.in/docs/development/upgrading)
* [OpenAPI documentation](https://doc.fastgpt.in/docs/development/openapi/)
- * [Knowledge base structure explained](https://doc.fastgpt.in/docs/course/datasetengine/)
+ * [Knowledge base structure explained](https://doc.fastgpt.in/docs/course/dataset_engine/)

<a href="#readme">
<img src="https://img.shields.io/badge/-返回顶部-7d09f1.svg" alt="#" align="right">
@@ -124,7 +121,7 @@

Scan with WeChat to join:

- 
+ 

<a href="#readme">
<img src="https://img.shields.io/badge/-返回顶部-7d09f1.svg" alt="#" align="right">

@@ -106,7 +106,7 @@ Project tech stack: NextJs + TS + ChakraUI + Mongo + Postgres (Vector plugin)

- **⚡ Deployment**

- [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)
+ [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)

Give it a 2-4 minute wait after deployment as it sets up the database. Initially, it might be a tad slow since we're using the basic settings.

@@ -94,7 +94,7 @@ https://github.com/labring/FastGPT/assets/15308462/7d3a38df-eb0e-4388-9250-2409b

- **⚡ Deployment**

- [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)
+ [](https://cloud.sealos.io/?openapp=system-fastdeploy%3FtemplateName%3Dfastgpt)

After deployment, allow 2~4 minutes while the database is set up; since the basic settings are used, the first access may be a little slow.
BIN docSite/assets/imgs/gapierTool13.webp (new file, 186 KiB)
BIN docSite/assets/imgs/laf1.webp (new file, 46 KiB)
BIN docSite/assets/imgs/laf2.webp (new file, 20 KiB)
BIN docSite/assets/imgs/laf3.webp (new file, 58 KiB)
BIN docSite/assets/imgs/laf4.png (new file, 89 KiB)
BIN docSite/assets/imgs/rerank1.png (new file, 91 KiB)
@@ -64,7 +64,7 @@ Tips: click the context button to view the full assembled context, which makes it easier to

FastGPT stores knowledge-base entries as QA pairs ("QA" does not necessarily mean question/answer; it simply names the two variables). When an entry is serialized to a string, it is formatted according to the **citation template**. The knowledge base exposes several variables: q, a, sourceId (the data ID), index (the n-th record), source (the collection or file name), and score (the distance score, 0-1). Reference them as needed with {{q}} {{a}} {{sourceId}} {{index}} {{source}} {{score}}. A template example follows:
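For instance, an illustrative citation template (hypothetical; any combination of the variables above works) might be:

```
{{source}} (score: {{score}})
{{q}}
{{a}}
```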
- See [Knowledge base structure explained](/docs/course/datasetEngine/) for the detailed structure of the knowledge base.
+ See [Knowledge base structure explained](/docs/course/dataset_engine/) for the detailed structure of the knowledge base.

#### Citation template

(deleted file)
@@ -1,93 +0,0 @@
---
title: "Knowledge Base Structure Explained"
description: "This section explains the knowledge base structure design of FastGPT: its QA storage format and multi-vector mapping, so you can build better knowledge bases. The focus is on usage; detailed theory is not covered."
icon: "dataset"
draft: false
toc: true
weight: 102
---

## Understanding vectors

FastGPT builds its knowledge base on the Embedding approach from RAG; to use FastGPT well you need a basic understanding of how `Embedding` vectors work and what their characteristics are.

Media such as text, images, and video cannot be understood by computers directly. To let a computer judge whether two pieces of text are similar or related, they usually have to be converted into a representation the computer can process; vectors are one such representation.

A vector can be thought of as an array of numbers. A `distance` between two vectors can be computed mathematically: the smaller the distance, the more similar the two vectors. Mapped back onto text, images, or video, this serves to judge the similarity of two media items. Vector search is built on this principle.

Because text comes in many forms with countless combinations, accuracy is hard to guarantee when converting it to vectors for similarity matching. Vector-based knowledge bases therefore usually rely on `topk` recall: fetch the `k` most similar items and hand them to a large model for further `semantic judgment`, `logical reasoning`, and `summarization`, which is how knowledge-base Q&A works. The vector-search step is therefore the most important part of knowledge-base Q&A.

Many factors affect vector-search accuracy, chiefly: the quality of the vector model, the quality of the data (length, completeness, diversity), and the accuracy of the retriever (a trade-off between speed and accuracy). Query quality is the counterpart of data quality.

Retriever accuracy is relatively easy to solve and vector-model training is rather involved, so optimizing data and query quality becomes the key step.

## Vector structure design in FastGPT

FastGPT uses the `PG Vector` plugin of `PostgreSQL` as its vector retriever, with an `HNSW` index. `PostgreSQL` is used only for vector retrieval; `MongoDB` stores all other data.

MongoDB's `dataset.datas` collection stores the source data behind each vector, along with an `indexes` field recording the corresponding vector IDs. It is an array, meaning a single data record can map to a whole group of vectors.

The `PostgreSQL` table has an `index` field storing the vector. At retrieval time, vectors are recalled first; their IDs are then used to look up the source content in MongoDB. Results pointing to the same source record are merged, keeping the highest vector score.



### Purpose and usage of multiple vectors

Within a single vector, content length and semantic richness are usually at odds; you cannot maximize both. FastGPT therefore uses multi-vector mapping: one data record maps to several vectors, preserving both completeness and semantic richness.

You can attach several vectors to one longer text, so the record is recalled whenever any one of its vectors is retrieved.

### Ways to improve vector-search accuracy

1. Better tokenization and segmentation: accuracy improves when each passage is structurally and semantically complete and single-topic, so many systems tune their splitters to keep every record as intact as possible.
2. Trim the `index` content to shorten the vectorized text: less, more precise `index` content raises retrieval accuracy at the cost of recall breadth; this suits scenarios with strict answers.
3. Increase the number of `index` entries by adding several `index`es for the same `chunk`.
4. Optimize the query: in practice user questions are often vague or incomplete, so rewriting the user's question (the query) also improves accuracy considerably.
5. Fine-tune the vector model: off-the-shelf vector models are general-purpose and less accurate in specialized domains; fine-tuning greatly improves domain-specific retrieval.

## How FastGPT structures a knowledge base

In FastGPT, a knowledge base consists of three parts: the dataset, collections, and data. A collection can be thought of as a `file`. A `dataset` can contain multiple `collections`, and a `collection` multiple `data` records. The smallest search unit is the `dataset`: a knowledge-base search always searches the entire `dataset`; collections merely organize the data and have no effect on search results (at least for now).

| Dataset | Collection | Data |
| --- | --- | --- |
|  |  |  |

### Import option 1: direct chunking

When importing a file you can choose direct chunking. It uses a `sentence splitter` to cut the text into pieces of a certain length, which end up as the `q` values. With direct chunking we recommend the `general template` for the app's `citation prompt`; the `QA template` is unnecessary.

| Interaction | Result |
| --- | --- |
|  |  |


### Import option 2: QA split

When importing a file you can choose QA splitting. It still uses the `sentence splitter`, but with pieces much longer than direct chunking. After import, a `large model` first studies each segment and produces some `questions` and `answers`; question and answer are ultimately stored together in `q`. Note: to widen search coverage, newer FastGPT versions no longer store the question and answer separately in q and a.

| Interaction | Result |
| --- | --- |
|  |  |

### Import option 3: manual entry

In any `collection`, click `Insert` in the top-right corner to enter knowledge points manually, or use the `annotation` feature. The searched content is `q`; the optional supplementary content is `a`.

| | | |
| --- | --- | --- |
|  |  |  |

### Import option 4: CSV import

Some data is unusual and may need separate preprocessing and splitting before import; in that case choose CSV import to load the processed data in bulk.



### Import option 5: API import

See [FastGPT OpenAPI usage](/docs/development/openapi).

## QA composition and citation-prompt construction

See [citation template and citation prompt examples](/docs/course/ai_settings/#示例)
docSite/content/docs/course/dataset_engine.md (new file): 136 lines

@@ -0,0 +1,136 @@
---
title: 'Knowledge base search options and parameters'
description: 'This section explains the knowledge base structure design of FastGPT: its QA storage format and multi-vector mapping, so you can build better knowledge bases. It also explains what each search parameter does. The focus is on usage; detailed theory is not covered.'
icon: 'language'
draft: false
toc: true
weight: 106
---

## Understanding vectors

FastGPT builds its knowledge base on the Embedding approach from RAG; to use FastGPT well you need a basic understanding of how `Embedding` vectors work and what their characteristics are.

Media such as text, images, and video cannot be understood by computers directly. To let a computer judge whether two pieces of text are similar or related, they usually have to be converted into a representation the computer can process; vectors are one such representation.

A vector can be thought of as an array of numbers. A `distance` between two vectors can be computed mathematically: the smaller the distance, the more similar the two vectors. Mapped back onto text, images, or video, this serves to judge the similarity of two media items. Vector search is built on this principle.

Because text comes in many forms with countless combinations, accuracy is hard to guarantee when converting it to vectors for similarity matching. Vector-based knowledge bases therefore usually rely on `topk` recall: fetch the `k` most similar items and hand them to a large model for further `semantic judgment`, `logical reasoning`, and `summarization`, which is how knowledge-base Q&A works. The vector-search step is therefore the most important part of knowledge-base Q&A.

Many factors affect vector-search accuracy, chiefly: the quality of the vector model, the quality of the data (length, completeness, diversity), and the accuracy of the retriever (a trade-off between speed and accuracy). Query quality is the counterpart of data quality.

Retriever accuracy is relatively easy to solve and vector-model training is rather involved, so optimizing data and query quality becomes the key step.


### Ways to improve vector-search accuracy

1. Better tokenization and segmentation: accuracy improves when each passage is structurally and semantically complete and single-topic, so many systems tune their splitters to keep every record as intact as possible.
2. Trim the `index` content to shorten the vectorized text: less, more precise `index` content raises retrieval accuracy at the cost of recall breadth; this suits scenarios with strict answers.
3. Increase the number of `index` entries by adding several `index`es for the same `chunk`.
4. Optimize the query: in practice user questions are often vague or incomplete, so rewriting the user's question (the query) also improves accuracy considerably.
5. Fine-tune the vector model: off-the-shelf vector models are general-purpose and less accurate in specialized domains; fine-tuning greatly improves domain-specific retrieval.

## How FastGPT structures a knowledge base

### Data storage structure

In FastGPT, a knowledge base consists of three parts: the dataset, collections, and data. A collection can be thought of as a `file`. A `dataset` can contain multiple `collections`, and a `collection` multiple `data` records. The smallest search unit is the `dataset`: a knowledge-base search always searches the entire `dataset`; collections merely organize the data and have no effect on search results (at least for now).



### Vector storage structure

FastGPT uses the `PG Vector` plugin of `PostgreSQL` as its vector retriever, with an `HNSW` index. `PostgreSQL` is used only for vector retrieval (this engine can be swapped for another database); `MongoDB` stores all other data.

MongoDB's `dataset.datas` collection stores the source data behind each vector, along with an `indexes` field recording the corresponding vector IDs. It is an array, meaning a single data record can map to a whole group of vectors.

The `PostgreSQL` table has a `vector` field storing the vector. At retrieval time, vectors are recalled first; their IDs are then used to look up the source content in MongoDB. Results pointing to the same source record are merged, keeping the highest vector score.



### Purpose and usage of multiple vectors

Within a single vector, content length and semantic richness are usually at odds; you cannot maximize both. FastGPT therefore uses multi-vector mapping: one data record maps to several vectors, preserving both completeness and semantic richness.

You can attach several vectors to one longer text, so the record is recalled whenever any one of its vectors is retrieved.

This means you can keep raising a chunk's precision by annotating it.
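As a rough sketch of the structure just described (a TypeScript object literal; field names other than `indexes`, `q`, and `a` are illustrative rather than the exact schema), one `dataset.datas` record carrying two vectors could look like:

```ts
// One MongoDB record mapped to two vectors stored in PostgreSQL.
// Recalling either vector recalls this record; the higher score wins on merge.
const dataRecord = {
  _id: 'mongo-object-id', // hypothetical ID
  q: 'A long, complete passage of source text...',
  a: 'Optional supplementary content',
  indexes: [
    { text: 'A long, complete passage of source text...', dataId: 'pg-vector-id-1' },
    { text: 'A short annotation that captures the key point', dataId: 'pg-vector-id-2' }
  ]
};
```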
### Retrieval scheme

1. `Question optimization` performs coreference resolution and query expansion, improving retrieval across multi-turn conversations and enriching semantics.
2. `Concat query` improves the ordering accuracy of `Rerank` in multi-turn conversations.
3. `RRF` merging combines the retrieval results of several channels (see the sketch after this list).
4. A second pass of `Rerank` ordering improves precision.
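A minimal sketch of the RRF idea in step 3 (the constant k = 60 is the common default from the RRF paper; FastGPT's actual implementation may differ):

```ts
// Reciprocal Rank Fusion: merge several ranked result lists into one.
// score(id) = sum over lists of 1 / (k + rank), so items ranked well
// in several channels float to the top.
function rrfMerge(lists: string[][], k = 60): string[] {
  const scores = new Map<string, number>();
  for (const list of lists) {
    list.forEach((id, rank) => {
      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1));
    });
  }
  return [...scores.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id);
}

// e.g. rrfMerge([semanticHits, fulltextHits]) for hybrid retrieval
```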



## Search parameters

| | | |
| --- |---| --- |
||  |  |

### Search modes

#### Semantic retrieval

Semantic retrieval computes the distance between the user's question and knowledge-base content via vector distance, yielding a "similarity". This is mathematical rather than linguistic similarity (a toy sketch follows the lists below).

Strengths:
- Understands near-synonymous phrasing
- Works across languages (e.g. a Chinese question matching English knowledge)
- Multi-modal understanding (text, images, audio/video, etc.)

Weaknesses:
- Depends on how well the model was trained
- Precision is not stable
- Affected by keywords and sentence completeness
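As a toy illustration of that mathematical similarity (real embeddings have hundreds or thousands of dimensions, and the stored metric may be a distance rather than cosine similarity):

```ts
// Cosine similarity of two embedding vectors: closer to 1 = more similar.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
```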
#### Full-text retrieval

Traditional full-text search; good for finding key subjects, predicates, and the like.

#### Hybrid retrieval

Runs vector retrieval and full-text retrieval together and merges the two result sets with the RRF formula; the results are generally richer and more accurate.

Because hybrid retrieval casts a very wide net and similarity filtering cannot be applied to it directly, the results usually need one re-ordering pass by a rerank model, whose scores are then used for filtering.

#### Result reranking

A `ReRank` model re-orders the search results and, in the vast majority of cases, measurably improves their accuracy. Rerank quality depends somewhat on question completeness (subject and predicate present), so question optimization usually runs before the search-and-rerank step. Reranking yields a `0-1` score for how relevant each result is to the question; this score is usually more precise than the vector score and can be used for filtering.

FastGPT merges the rerank results, the vector-search results, and the full-text results with `RRF` to obtain the final result set.

### Search filtering

#### Citation limit

Each search cites at most `n` `tokens` of content.

`top k` was not used because in mixed knowledge bases (QA bases plus document bases) the lengths of different `chunks` vary widely, which makes `top k` results unstable; the citation limit is therefore controlled in `tokens`.
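A sketch of that idea (`countTokens` stands in for whichever tokenizer is used; it is not a FastGPT API):

```ts
// Take ranked chunks until the token budget is used up, instead of a fixed top k.
function takeWithinTokenBudget(
  chunks: { text: string; score: number }[], // assumed sorted by score, best first
  maxTokens: number,
  countTokens: (s: string) => number
) {
  const picked: typeof chunks = [];
  let used = 0;
  for (const chunk of chunks) {
    const cost = countTokens(chunk.text);
    if (used + cost > maxTokens) break; // budget exhausted: stop citing
    picked.push(chunk);
    used += cost;
  }
  return picked;
}
```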
#### Minimum relevance

A `0-1` value that filters out low-relevance search results.

It only takes effect with `semantic retrieval` or when `result reranking` is enabled.

### Question optimization

#### Background

In RAG, the input question drives an embedding search against the database to find related, similar content (knowledge-base search for short).

During search, and especially in multi-turn conversations, follow-up questions often fail to retrieve suitable content. One reason is that the knowledge-base search uses only the "current" question. Consider the example below:



When the user asks "what is the second point", the knowledge base is searched for exactly "what is the second point" and finds nothing; what actually needs to be searched is "what is the QA structure". We therefore introduce a [question optimization] module that completes the user's current question so the knowledge-base search can find suitable content. With completion the effect looks like this:



#### Implementation

Before `data retrieval`, the model first performs `coreference resolution` and `question expansion`, which resolves unclear referents and enriches the question's semantics. You can inspect the completed queries in the conversation details after each turn.
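A sketch of the idea (the prompt wording and the `chatCompletion` helper are illustrative, not FastGPT's actual code):

```ts
// Rewrite the latest question into standalone queries using the chat history.
async function optimizeQuery(
  history: { role: 'user' | 'assistant'; content: string }[],
  question: string,
  chatCompletion: (prompt: string) => Promise<string> // any LLM call
): Promise<string[]> {
  const prompt = [
    'Rewrite the final user question so it is understandable without the history:',
    '1. Resolve referents such as "it" or "the second point".',
    '2. Output 1-3 standalone search queries, one per line.',
    '',
    ...history.map((m) => `${m.role}: ${m.content}`),
    `user: ${question}`
  ].join('\n');

  const answer = await chatCompletion(prompt);
  return answer.split('\n').filter(Boolean);
}
```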
@@ -19,6 +19,9 @@ All llm models merged

```json
{
+ "feConfigs": {
+   "lafEnv": "https://laf.dev" // laf environment
+ },
  "systemEnv": {
    "vectorMaxProcess": 15,
    "qaMaxProcess": 15,
```
@@ -153,7 +156,7 @@ All llm models merged

Use version 4.6.6-alpha or later. `reRankModels` in the config file holds the rerank models; although it is an array, only the first entry currently takes effect.

- 1. [Deploy the ReRank model](/docs/development/custom-models/reranker/)
+ 1. [Deploy the ReRank model](/docs/development/custom-models/bge-rerank/)
1. Find `reRankModels` in the FastGPT config file (before 4.6.6 it was `ReRankModels`).
2. Update the corresponding values (remember to remove the comments):

@@ -164,7 +167,7 @@ All llm models merged

```json
    "model": "bge-reranker-base", // any value
    "name": "检索重排-base", // any value
    "charsPointsPrice": 0,
-   "requestUrl": "{{host}}/api/v1/rerank",
+   "requestUrl": "{{host}}/v1/rerank",
    "requestAuth": "security credential; Bearer is added automatically"
  }
]
```
docSite/content/docs/development/custom-models/bge-rerank.md (new file): 121 lines

@@ -0,0 +1,121 @@
---
title: 'Integrating the bge-rerank rerank model'
description: 'Integrating the bge-rerank rerank model'
icon: 'sort'
draft: false
toc: true
weight: 910
---

## Recommended configuration per model

Recommended configuration:

{{< table "table-hover table-striped-columns" >}}
| Model | RAM | VRAM | Disk | Launch command |
|------|---------|---------|----------|--------------------------|
| bge-rerank-base | >=4GB | >=4GB | >=8GB | python app.py |
| bge-rerank-large | >=8GB | >=8GB | >=8GB | python app.py |
| bge-rerank-v2-m3 | >=8GB | >=8GB | >=8GB | python app.py |
{{< /table >}}

## Deploying from source

### 1. Environment

- Python 3.9 or 3.10
- CUDA 11.7
- Unrestricted internet access (a proxy may be required)

### 2. Download the code

The code for the 3 models:

1. [https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-reranker-base](https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-reranker-base)
2. [https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-reranker-large](https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-reranker-large)
3. [https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-rerank-v2-m3](https://github.com/labring/FastGPT/tree/main/python/bge-rerank/bge-rerank-v2-m3)

### 3. Install dependencies

```sh
pip install -r requirements.txt
```

### 4. Download the models

The huggingface repositories of the 3 models:

1. [https://huggingface.co/BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base)
2. [https://huggingface.co/BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large)
3. [https://huggingface.co/BAAI/bge-rerank-v2-m3](https://huggingface.co/BAAI/bge-rerank-v2-m3)

Clone the model into the corresponding code directory. Directory layout:

```
bge-reranker-base/
app.py
Dockerfile
requirements.txt
```

### 5. Run it

```bash
python app.py
```

On a successful start it prints an address like this:



> `http://0.0.0.0:6006` here is the connection address.

## Docker deployment

**Image names:**

1. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1 (4 GB+)
2. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-large:v0.1 (5 GB+)
3. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-v2-m3:v0.1 (5 GB+)

**Port**

6006

**Environment variables**

```
ACCESS_TOKEN=the access credential; requests must send Authorization: Bearer ${ACCESS_TOKEN}
```

**Example run command**

```sh
# auth token is mytoken
docker run -d --name reranker -p 6006:6006 -e ACCESS_TOKEN=mytoken --gpus all registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1
```

**docker-compose.yml example**
```
version: "3"
services:
  reranker:
    image: registry.cn-hangzhou.aliyuncs.com/fastgpt/rerank:v0.2
    container_name: reranker
    # GPU runtime; if the host has no GPU, comment out the deploy section
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - 6006:6006
    environment:
      - ACCESS_TOKEN=mytoken

```
## Integrating with FastGPT

See [ReRank model integration](/docs/development/configuration/#rerank-接入); the host variable is your deployment's domain.
(deleted file)
@@ -1,90 +0,0 @@
---
title: 'Integrating the ReRank rerank model'
description: 'Integrating the ReRank rerank model'
icon: 'sort'
draft: false
toc: true
weight: 910
---

## Recommended configuration

Recommended configuration:

{{< table "table-hover table-striped-columns" >}}
| Type | RAM | VRAM | Disk | Launch command |
|------|---------|---------|----------|--------------------------|
| base | >=4GB | >=3GB | >=8GB | python app.py |
{{< /table >}}

## Deployment

### Environment requirements

- Python 3.10.11
- CUDA 11.7
- Unrestricted internet access (a proxy may be required)

### Deploying from source

1. Set up the environment above (ask GPT for a concrete walkthrough);
2. Download the [python files](https://github.com/labring/FastGPT/tree/main/python/reranker/bge-reranker-base)
3. Run `pip install -r requirements.txt`;
4. Download the model repository from [https://huggingface.co/BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) into the directory containing app.py
5. Set the token with `export ACCESS_TOKEN=XXXXXX`; the token is only a verification layer against interface abuse and defaults to `ACCESS_TOKEN`;
6. Run `python app.py`.

Then wait for the model to download and finish loading. If errors appear, ask GPT first.

On a successful start it prints an address like this:



> `http://0.0.0.0:6006` here is the connection address.

### Docker deployment

+ Image: `luanshaotong/reranker:v0.2`
+ Port: 6006
+ Size: about 8GB

**Set the access credential (i.e. the channel key in oneapi)**
```
ACCESS_TOKEN=mytoken
```

**Example run commands**
- CPU only, no GPU required
```sh
docker run -d --name reranker -p 6006:6006 -e ACCESS_TOKEN=mytoken luanshaotong/reranker:v0.2
```

- Requires a CUDA 11.7 environment
```sh
docker run -d --gpus all --name reranker -p 6006:6006 -e ACCESS_TOKEN=mytoken luanshaotong/reranker:v0.2
```

**docker-compose.yml example**
```
version: "3"
services:
  reranker:
    image: luanshaotong/reranker:v0.2
    container_name: reranker
    # GPU runtime; if the host has no GPU, comment out the deploy section
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - 6006:6006
    environment:
      - ACCESS_TOKEN=mytoken

```
## Integrating with FastGPT

See [ReRank model integration](/docs/development/configuration/#rerank-接入); the host variable is your deployment's domain.
@@ -32,7 +32,7 @@ FastGPT uses the one-api project to manage the model pool; it is compatible with OpenAI, A

You can [deploy OneAPI quickly on Sealos](/docs/development/one-api); see the project [README](https://github.com/songquanpeng/one-api) for more deployment methods, or deploy in one click via the button below:

- <a href="https://template.cloud.sealos.io/deploy?templateName=one-api" rel="external" target="_blank"><img src="https://cdn.jsdelivr.us/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>
+ <a href="https://template.cloud.sealos.io/deploy?templateName=one-api" rel="external" target="_blank"><img src="https://cdn.jsdelivr.net/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>

## 1. Install Docker and docker-compose

@@ -270,3 +270,7 @@ Mongo connection failed: check mongo's running status in the corresponding logs.

### First deployment: the root user is reported as unregistered

The logs will show the error. Most likely Mongo was not started in replica-set mode.

### Cannot export the knowledge base / cannot use voice input or playback

No SSL certificate is configured, so some features are unavailable.

@@ -29,7 +29,7 @@ The MySQL build supports multiple instances and high concurrency.

Click the button below to deploy in one click 👇

- <a href="https://template.cloud.sealos.io/deploy?templateName=one-api" rel="external" target="_blank"><img src="https://cdn.jsdelivr.us/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>
+ <a href="https://template.cloud.sealos.io/deploy?templateName=one-api" rel="external" target="_blank"><img src="https://cdn.jsdelivr.net/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>

After deployment you are redirected to "App Management"; the database lives in the separate "Database" app. Wait 1~3 minutes for the database to come up before access succeeds.

@@ -21,7 +21,7 @@ FastGPT uses the one-api project to manage the model pool; it is compatible with OpenAI, A

## One-click deployment
The Sealos servers are overseas, so no extra network handling is needed: no server, no proxy, no domain, with support for high concurrency & dynamic scaling. Click the button below to deploy in one click 👇

- <a href="https://template.cloud.sealos.io/deploy?templateName=fastgpt" rel="external" target="_blank"><img src="https://cdn.jsdelivr.us/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>
+ <a href="https://template.cloud.sealos.io/deploy?templateName=fastgpt" rel="external" target="_blank"><img src="https://cdn.jsdelivr.net/gh/labring-actions/templates@main/Deploy-on-Sealos.svg" alt="Deploy on Sealos"/></a>

Because a database has to be deployed, wait 2~4 minutes after deployment before access works. The lowest configuration is used by default, so the first visit will be a little slow.
@@ -1,5 +1,5 @@
---
- title: 'V4.7 (in progress)'
+ title: 'V4.7 (initialization required)'
description: 'FastGPT V4.7 release notes'
icon: 'upgrade'
draft: false
@@ -26,7 +26,7 @@ curl --location --request POST 'https://{{host}}/api/admin/initv47' \

## 3. Upgrade the ReRank model

- 4.7 changed the ReRank model's format to be compatible with cohere's, so cohere's own API can be used directly. For a locally run ReRank model, change the image to: `luanshaotong/reranker:v0.2`.
+ 4.7 changed the ReRank model's format to be compatible with cohere's, so cohere's own API can be used directly. For a locally run ReRank model, change the image to: `registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1`.

cohere's rerank model is not great with Chinese and feels less useful than bge; the integration tutorial is below:
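For orientation, a cohere-style rerank call generally has the shape sketched below (the request and response fields follow cohere's public rerank API; the host and model name depend on your deployment):

```ts
// Minimal cohere-compatible rerank request: a query plus candidate documents in,
// indexes with relevance scores out.
async function rerank(host: string, token: string, query: string, documents: string[]) {
  const res = await fetch(`${host}/v1/rerank`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({ model: 'bge-reranker-base', query, documents })
  });
  const data = await res.json();
  return data.results as { index: number; relevance_score: number }[];
}
```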
docSite/content/docs/development/upgrading/471.md (new file): 33 lines

@@ -0,0 +1,33 @@
---
title: 'V4.7.1 (in progress)'
description: 'FastGPT V4.7.1 release notes'
icon: 'upgrade'
draft: false
toc: true
weight: 825
---

## Initialization script

From any terminal, send one HTTP request, replacing {{rootkey}} with the `rootkey` environment variable and {{host}} with your FastGPT domain.

```bash
curl --location --request POST 'https://{{host}}/api/admin/clearInvalidData' \
--header 'rootkey: {{rootkey}}' \
--header 'Content-Type: application/json'
```

This request cleans up dirty data (invalid files, invalid images, invalid knowledge-base collections, and invalid vectors).

## V4.7.1 release notes

1. New: complete voice-input configuration. You can choose whether voice input is enabled (including on share pages), auto-send after voice input, and automatic (streamed) voice playback of the answer after voice input.
2. New: pptx and xlsx file reading. All file reading now runs server-side, which consumes more server resources and prevents previewing more content at upload time.
3. New: Laf cloud-function integration; cloud functions in a Laf account can be used as HTTP modules.
4. New: a scheduled job that cleans up garbage data. (It cleans in small windows, covering the most recent n hours, so keep the service running continuously; after longer downtime you can still call the clearInvalidData endpoint for a full cleanup.)
5. New in the commercial edition: system notifications configurable from the admin console.
6. Changed: the csv import template no longer validates headers; the first two columns are picked up automatically.
7. Fixed: data-type validation error on tool-call module connections.
8. Fixed: destructuring failure when entering a custom index.
9. Fixed: rerank model data format.
10. Fixed: a bug in question-completion history.
docSite/content/docs/workflow/modules/laf.md (new file): 100 lines

@@ -0,0 +1,100 @@
---
title: "Laf function calls"
description: "An introduction to the FastGPT Laf function-call module"
icon: "code"
draft: false
toc: true
weight: 355
---



## Introduction

The `Laf function call` module invokes cloud functions under a Laf account. It works the same way as the HTTP module, with these particular traits:

- Only POST requests can be used.
- Requests carry the system parameters in systemParams automatically; nothing needs to be passed via variables.

## Binding a Laf account

To call Laf cloud functions, first bind a Laf account and application, and create cloud functions inside that application.

Laf provides a PAT (personal access token) for quick sign-in outside the Laf platform; see the [Laf docs](https://doc.Laf.run/zh/cli/#%E7%99%BB%E5%BD%95) for details on obtaining a PAT.

With the PAT in hand, bind the Laf account on FastGPT's `account page` or in the `Laf module` in advanced orchestration. The Laf account is shared by the team; only team administrators can configure it.

After entering the PAT and verifying it, select the application to bind (it must be in Running state); you can then call the cloud functions under that application.

> To unbind, clear the binding and click "Update".



## Writing a cloud function

Laf cloud functions can generate OpenAPI definitions automatically from their interfaces; model your function on the code below so the OpenAPI document can be generated.

The `Laf module` uses the OpenAPI document to recognize inputs and outputs automatically, so you do not have to add data types by hand. If you don't write TS, skip this and add the parameters manually in FastGPT.

```ts
import cloud from '@lafjs/cloud'

interface IRequestBody { // custom inputs; FastGPT always sends POST requests.
  data1: string // required parameter
  data2?: string // optional parameter
}

interface RequestProps extends IRequestBody { // full input shape; no changes needed here.
  systemParams: { // parameters FastGPT passes by default
    appId: string,
    variables: string,
    histories: string,
    cTime: string,
    chatId: string,
    responseChatItemId: string
  }
}

interface IResponse { // response body
  message: string // required return field
  msg?: string; // optional return field
}

export default async function (ctx: FunctionContext): Promise<IResponse> {
  const {
    data1,
    data2,
    systemParams
  }: RequestProps = ctx.body;

  console.log({
    data1,
    data2,
    systemParams
  });

  return {
    message: 'ok',
    msg: 'msg'
  };
}
```

Alternatively, pick fastgpt_template on the Laf platform to generate this function template quickly.

Concretely: go to Laf's function page, create a function (note that fastgpt only calls functions served over POST), then either copy the code above or click "more templates", search for "fastgpt", and use the template below



## Using it in FastGPT

After selecting a function, click "Sync parameters" to pull the cloud function's parameters into FastGPT automatically. You can also add them manually; manually edited parameters are not overwritten by "Sync parameters".



## Usage notes

### Call errors

Debug the function in laf first to confirm it can be called. You can console.log the inputs and paste them into the Body of Laf's test page for testing.
@@ -22,7 +22,7 @@ weight: 356

## How tools run

- To understand how a tool is permitted, you first need to know its run conditions.
+ To understand how a tool runs, you first need to know its run conditions.

1. A tool needs an introduction (i.e. a description). It tells the LLM what the tool does, and the LLM decides from the conversation context whether to call it.
2. Tool parameters. Some tools need special parameters when called. A parameter has two key attributes: its `parameter description` and whether it is `required` (see the sketch below).
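A sketch of what such a description-plus-parameters definition typically looks like (an OpenAI-style function/tool schema shown for illustration; FastGPT's internal field names may differ, and the tool itself is hypothetical):

```ts
// The LLM sees the description and parameter specs and decides when to call.
const toolDefinition = {
  name: 'get_weather', // hypothetical tool
  description: 'Look up the current weather for a city.',
  parameters: {
    type: 'object',
    properties: {
      city: { type: 'string', description: 'City name, e.g. "Hangzhou"' } // parameter description
    },
    required: ['city'] // whether the parameter is required
  }
};
```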
@@ -58,7 +58,7 @@

<!-- change -->
<script
- src="https://cdn.jsdelivr.us/npm/medium-zoom/dist/medium-zoom.min.js"
+ src="https://cdn.jsdelivr.net/npm/medium-zoom/dist/medium-zoom.min.js"
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>
@@ -1,5 +1,5 @@
<head>
- <script defer type="text/javascript" src="{{ "js/jsdelivr-auto-fallback.js" | absURL }}"></script>
+ <!-- <script defer type="text/javascript" src="{{ "js/jsdelivr-auto-fallback.js" | absURL }}"></script> -->
  <meta charset="utf-8" />
  <title>
    {{- $url := replace .Permalink ( printf "%s" .Site.BaseURL) "" }}
@@ -106,6 +106,6 @@
  {{- end -}}
  {{- end -}}
  <!-- change -->
- <link rel="preload" href="https://cdn.jsdelivr.us/npm/lxgw-wenkai-screen-webfont@1.1.0/style.css" as="style" />
- <link rel="stylesheet" href="https://cdn.jsdelivr.us/npm/lxgw-wenkai-screen-webfont@1.1.0/style.css" />
+ <link rel="preload" href="https://cdn.jsdelivr.net/npm/lxgw-wenkai-screen-webfont@1.1.0/style.css" as="style" />
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/lxgw-wenkai-screen-webfont@1.1.0/style.css" />
</head>
@@ -4,7 +4,7 @@
let failed;
let isRunning;
const DEST_LIST = [
- 'cdn.jsdelivr.us',
+ 'cdn.jsdelivr.net',
  'jsd.cdn.zzko.cn',
  'jsd.onmicrosoft.cn'
];
@@ -51,7 +51,7 @@ services:
        })
      }' > /data/initReplicaSet.js
      # Start the MongoDB service
-     exec docker-entrypoint.sh "$@" &
+     exec docker-entrypoint.sh "$$@" &

      # Wait for the MongoDB service to start
      until mongo -u myusername -p mypassword --authenticationDatabase admin --eval "print('waited for connection')" > /dev/null 2>&1; do
@@ -63,11 +63,11 @@ services:
      mongo -u myusername -p mypassword --authenticationDatabase admin /data/initReplicaSet.js

      # Wait on the MongoDB service process started by docker-entrypoint.sh
-     wait $!
+     wait $$!
  fastgpt:
    container_name: fastgpt
-   image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.6.9 # git
-   # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.6.9 # Alibaba Cloud
+   image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.7 # git
+   # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.7 # Alibaba Cloud
    ports:
      - 3000:3000
    networks:
@@ -1,6 +1,6 @@
apiVersion: v2
name: fastgpt
- description: A Helm chart for Kubernetes
+ description: A Helm chart for FastGPT

# A chart can be either an 'application' or a 'library' chart.
#
files/helm/fastgpt/README.md (new file): 67 lines

@@ -0,0 +1,67 @@
# fastgpt

A Helm chart for FastGPT

## Requirements

| Repository | Name | Version |
|------------|------|---------|
| oci://registry-1.docker.io/bitnamicharts | mongodb | 15.0.1 |
| oci://registry-1.docker.io/bitnamicharts | postgresql | 15.0.0 |

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| affinity | object | `{}` | |
| autoscaling.enabled | bool | `false` | |
| autoscaling.maxReplicas | int | `100` | |
| autoscaling.minReplicas | int | `1` | |
| autoscaling.targetCPUUtilizationPercentage | int | `80` | |
| fullnameOverride | string | `""` | |
| image.pullPolicy | string | `"IfNotPresent"` | |
| image.repository | string | `"ghcr.io/labring/fastgpt"` | |
| image.tag | string | `""` | |
| imagePullSecrets | list | `[]` | |
| ingress.annotations | object | `{}` | |
| ingress.className | string | `""` | |
| ingress.enabled | bool | `false` | |
| ingress.hosts[0].host | string | `"chart-example.local"` | |
| ingress.hosts[0].paths[0].path | string | `"/"` | |
| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | |
| ingress.tls | list | `[]` | |
| livenessProbe.httpGet.path | string | `"/"` | |
| livenessProbe.httpGet.port | string | `"http"` | |
| mongodb.architecture | string | `"replicaset"` | |
| mongodb.auth.rootPassword | string | `"123456"` | |
| mongodb.auth.rootUser | string | `"root"` | |
| mongodb.enabled | bool | `true` | Enable or disable the built-in MongoDB |
| nameOverride | string | `""` | |
| nodeSelector | object | `{}` | |
| podAnnotations | object | `{}` | |
| podLabels | object | `{}` | |
| podSecurityContext | object | `{}` | |
| postgresql.enabled | bool | `true` | Enable or disable the built-in PostgreSQL |
| postgresql.global.postgresql.auth.database | string | `"postgres"` | The default database of PostgreSQL |
| postgresql.global.postgresql.auth.postgresPassword | string | `"postgres"` | The password of PostgreSQL; the default username is `postgres` |
| postgresql.image.repository | string | `"linuxsuren/pgvector"` | The PostgreSQL image that includes the pgvector extension. See also the source code at https://github.com/LinuxSuRen/pgvector-docker |
| postgresql.image.tag | string | `"v0.0.1"` | |
| readinessProbe.httpGet.path | string | `"/"` | |
| readinessProbe.httpGet.port | string | `"http"` | |
| replicaCount | int | `1` | |
| resources | object | `{}` | |
| securityContext | object | `{}` | |
| service.port | int | `3000` | |
| service.type | string | `"ClusterIP"` | |
| serviceAccount.annotations | object | `{}` | |
| serviceAccount.automount | bool | `true` | |
| serviceAccount.create | bool | `true` | |
| serviceAccount.name | string | `""` | |
| tolerations | list | `[]` | |
| volumeMounts | list | `[]` | |
| volumes | list | `[]` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
@@ -112,6 +112,7 @@ tolerations: []
affinity: {}

mongodb:
+ # -- Enable or disable the built-in MongoDB
  enabled: true
  architecture: replicaset
  auth:
@@ -119,13 +120,17 @@ mongodb:
    rootPassword: "123456"

postgresql:
+ # -- Enable or disable the built-in PostgreSQL
  enabled: true
  image:
    # registry: 172.11.0.6:30002
+   # -- The PostgreSQL image which include the pgvector extension. See also the source code from https://github.com/LinuxSuRen/pgvector-docker
    repository: linuxsuren/pgvector
    tag: v0.0.1
  global:
    postgresql:
      auth:
+       # -- The password of PostgreSQL, default username is `postgres`
        postgresPassword: postgres
+       # -- The default database of PostgreSQL
        database: postgres
@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
- fileNotFound = 'fileNotFound'
+ fileNotFound = 'fileNotFound',
+ unAuthFile = 'unAuthFile'
}
const datasetErr = [
  {
    statusText: CommonErrEnum.fileNotFound,
    message: 'error.fileNotFound'
  },
+ {
+   statusText: CommonErrEnum.unAuthFile,
+   message: 'error.unAuthFile'
+ }
];
export default datasetErr.reduce((acc, cur, index) => {
@@ -1,5 +1,7 @@
export const fileImgs = [
  { suffix: 'pdf', src: 'file/fill/pdf' },
+ { suffix: 'ppt', src: 'file/fill/ppt' },
+ { suffix: 'xlsx', src: 'file/fill/xlsx' },
  { suffix: 'csv', src: 'file/fill/csv' },
  { suffix: '(doc|docs)', src: 'file/fill/doc' },
  { suffix: 'txt', src: 'file/fill/txt' },
@@ -11,5 +11,5 @@ export const formatFileSize = (bytes: number): string => {
};

export const detectFileEncoding = (buffers: string | Buffer) => {
- return detect(buffers)?.encoding || 'utf-8';
+ return (detect(buffers)?.encoding || 'utf-8') as BufferEncoding;
};
@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
  { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },

  { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
- { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
+ { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // enlarge the chunk so it stays a complete paragraph where possible. (?![\*\-|>`0-9]): markdown special char
  { reg: /([\n])/g, maxLen: chunkLen * 1.2 },

  // ------ There's no overlap on the top
  { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
  { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
  { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@ export const splitText2Chunks = (props: {
  const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
  const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;

- // if use markdown title split, Separate record title title
+ // if use markdown title split, Separate record title
  const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
    if (step >= stepReges.length) {
      return [
@@ -97,6 +97,7 @@ export const splitText2Chunks = (props: {
      .filter((item) => item.text.trim());
  };

+ /* Gets the overlap at the end of a text as the beginning of the next block */
  const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
    const forbidOverlap = checkForbidOverlap(step);
    const maxOverlapLen = chunkLen * 0.4;
packages/global/common/system/api.d.ts (new file): 1 line

@@ -0,0 +1 @@
export type AuthGoogleTokenProps = { googleToken: string; remoteip?: string | null };

@@ -55,7 +55,9 @@
  customApiDomain?: string;
  customSharePageDomain?: string;

  uploadFileMaxAmount?: number;
  uploadFileMaxSize?: number;
+ lafEnv?: string;
};

export type SystemEnvType = {
packages/global/core/app/api.d.ts: 2 changes

@@ -1,6 +1,6 @@
import type { LLMModelItemType } from '../ai/model.d';
import { AppTypeEnum } from './constants';
- import { AppSchema, AppSimpleEditFormType } from './type';
+ import { AppSchema } from './type';

export type CreateAppParams = {
  name?: string;
@@ -1,3 +1,5 @@
+ import { AppWhisperConfigType } from './type';
+
export enum AppTypeEnum {
  simple = 'simple',
  advanced = 'advanced'
@@ -10,3 +12,9 @@ export const AppTypeMap = {
    label: 'advanced'
  }
};
+
+ export const defaultWhisperConfig: AppWhisperConfigType = {
+   open: false,
+   autoSend: false,
+   autoTTSResponse: false
+ };
packages/global/core/app/type.d.ts: 34 changes

@@ -1,9 +1,5 @@
- import type {
-   AppTTSConfigType,
-   FlowNodeTemplateType,
-   ModuleItemType,
-   VariableItemType
- } from '../module/type.d';
+ import type { FlowNodeTemplateType, ModuleItemType } from '../module/type.d';

import { AppTypeEnum } from './constants';
import { PermissionTypeEnum } from '../../support/permission/constant';
import type { DatasetModuleProps } from '../module/node/type.d';
@@ -82,5 +78,31 @@ export type AppSimpleEditFormType = {
      voice?: string | undefined;
      speed?: number | undefined;
    };
+   whisper: AppWhisperConfigType;
  };
};

+ /* app function config */
+ // variable
+ export type VariableItemType = {
+   id: string;
+   key: string;
+   label: string;
+   type: `${VariableInputEnum}`;
+   required: boolean;
+   maxLen: number;
+   enums: { value: string }[];
+ };
+ // tts
+ export type AppTTSConfigType = {
+   type: 'none' | 'web' | 'model';
+   model?: string;
+   voice?: string;
+   speed?: number;
+ };
+ // whisper
+ export type AppWhisperConfigType = {
+   open: boolean;
+   autoSend: boolean;
+   autoTTSResponse: boolean;
+ };
@@ -9,6 +9,7 @@ import type { FlowNodeInputItemType } from '../module/node/type.d';
import { getGuideModule, splitGuideModule } from '../module/utils';
import { ModuleItemType } from '../module/type.d';
import { DatasetSearchModeEnum } from '../dataset/constants';
+ import { defaultWhisperConfig } from './constants';

export const getDefaultAppForm = (): AppSimpleEditFormType => {
  return {
@@ -36,7 +37,8 @@ export const getDefaultAppForm = (): AppSimpleEditFormType => {
      questionGuide: false,
      tts: {
        type: 'web'
-     }
+     },
+     whisper: defaultWhisperConfig
    }
  };
};
@@ -107,14 +109,15 @@ export const appModules2Form = ({ modules }: { modules: ModuleItemType[] }) => {
        ModuleInputKeyEnum.datasetSearchExtensionBg
      );
    } else if (module.flowType === FlowNodeTypeEnum.userGuide) {
-     const { welcomeText, variableModules, questionGuide, ttsConfig } = splitGuideModule(
-       getGuideModule(modules)
-     );
+     const { welcomeText, variableModules, questionGuide, ttsConfig, whisperConfig } =
+       splitGuideModule(getGuideModule(modules));

      defaultAppForm.userGuide = {
        welcomeText: welcomeText,
        variables: variableModules,
        questionGuide: questionGuide,
-       tts: ttsConfig
+       tts: ttsConfig,
+       whisper: whisperConfig
      };
    } else if (module.flowType === FlowNodeTypeEnum.pluginModule) {
      defaultAppForm.selectedTools.push({
packages/global/core/chat/type.d.ts
vendored
@@ -109,7 +109,7 @@ export type ChatItemType = (UserChatItemType | SystemChatItemType | AIChatItemTy
|
||||
};
|
||||
|
||||
export type ChatSiteItemType = (UserChatItemType | SystemChatItemType | AIChatItemType) & {
|
||||
dataId?: string;
|
||||
dataId: string;
|
||||
status: `${ChatStatusEnum}`;
|
||||
moduleName?: string;
|
||||
ttsBuffer?: Uint8Array;
|
||||
|
||||
packages/global/core/dataset/api.d.ts: 12 changes

@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  link: string;
};
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  fileId: string;
};
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  name: string;
  rawTextLength: number;
  hashRawText: string;

  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
};
export type CsvTableCreateDatasetCollectionParams = {
  datasetId: string;
  parentId?: string;
  fileId: string;
};

/* ================= data ===================== */
export type PgSearchRawType = {
@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
/* ------------ data -------------- */

/* ------------ training -------------- */
+ export enum ImportDataSourceEnum {
+   fileLocal = 'fileLocal',
+   fileLink = 'fileLink',
+   fileCustom = 'fileCustom',
+   csvTable = 'csvTable'
+ }
+
export enum TrainingModeEnum {
  chunk = 'chunk',
  auto = 'auto',
@@ -37,6 +37,7 @@ export enum ModuleInputKeyEnum {
  userChatInput = 'userChatInput',
  questionGuide = 'questionGuide',
  tts = 'tts',
+ whisper = 'whisper',
  answerText = 'text',
  agents = 'agents', // cq agent key
@@ -61,7 +61,8 @@ export enum FlowNodeTypeEnum {
  pluginOutput = 'pluginOutput',
  queryExtension = 'cfr',
  tools = 'tools',
- stopTool = 'stopTool'
+ stopTool = 'stopTool',
+ lafModule = 'lafModule'

  // abandon
}
@@ -20,6 +20,7 @@ import { AiQueryExtension } from './system/queryExtension';

import type { FlowNodeTemplateType, moduleTemplateListType } from '../../module/type.d';
import { FlowNodeTemplateTypeEnum } from '../../module/constants';
+ import { lafModule } from './system/laf';

/* app flow module templates */
export const appSystemModuleTemplates: FlowNodeTemplateType[] = [
@@ -35,7 +36,8 @@ export const appSystemModuleTemplates: FlowNodeTemplateType[] = [
  ClassifyQuestionModule,
  ContextExtractModule,
  HttpModule468,
- AiQueryExtension
+ AiQueryExtension,
+ lafModule
];
/* plugin flow module templates */
export const pluginSystemModuleTemplates: FlowNodeTemplateType[] = [
@@ -51,7 +53,8 @@ export const pluginSystemModuleTemplates: FlowNodeTemplateType[] = [
  ClassifyQuestionModule,
  ContextExtractModule,
  HttpModule468,
- AiQueryExtension
+ AiQueryExtension,
+ lafModule
];

/* all module */
@@ -73,7 +76,8 @@ export const moduleTemplatesFlat: FlowNodeTemplateType[] = [
  PluginInputModule,
  PluginOutputModule,
  RunPluginModule,
- AiQueryExtension
+ AiQueryExtension,
+ lafModule
];

export const moduleTemplatesList: moduleTemplateListType = [
packages/global/core/module/template/system/laf.ts (new file): 86 lines

@@ -0,0 +1,86 @@
import {
  FlowNodeInputTypeEnum,
  FlowNodeOutputTypeEnum,
  FlowNodeTypeEnum
} from '../../node/constant';
import { FlowNodeTemplateType } from '../../type';
import {
  ModuleIOValueTypeEnum,
  ModuleInputKeyEnum,
  ModuleOutputKeyEnum,
  FlowNodeTemplateTypeEnum
} from '../../constants';
import {
  Input_Template_DynamicInput,
  Input_Template_Switch,
  Input_Template_AddInputParam
} from '../input';
import { Output_Template_Finish, Output_Template_AddOutput } from '../output';

export const lafModule: FlowNodeTemplateType = {
  id: FlowNodeTypeEnum.lafModule,
  templateType: FlowNodeTemplateTypeEnum.externalCall,
  flowType: FlowNodeTypeEnum.lafModule,
  avatar: '/imgs/module/laf.png',
  name: 'Laf 函数调用(测试)',
  intro: '可以调用Laf账号下的云函数。',
  showStatus: true,
  isTool: true,
  inputs: [
    Input_Template_Switch,
    {
      key: ModuleInputKeyEnum.httpReqUrl,
      type: FlowNodeInputTypeEnum.hidden,
      valueType: ModuleIOValueTypeEnum.string,
      label: '',
      description: 'core.module.input.description.Http Request Url',
      placeholder: 'https://api.ai.com/getInventory',
      required: false,
      showTargetInApp: false,
      showTargetInPlugin: false
    },
    Input_Template_DynamicInput,
    {
      ...Input_Template_AddInputParam,
      editField: {
        key: true,
        description: true,
        dataType: true
      },
      defaultEditField: {
        label: '',
        key: '',
        description: '',
        inputType: FlowNodeInputTypeEnum.target,
        valueType: ModuleIOValueTypeEnum.string
      }
    }
  ],
  outputs: [
    {
      key: ModuleOutputKeyEnum.httpRawResponse,
      label: '原始响应',
      description: 'HTTP请求的原始响应。只能接受字符串或JSON类型响应数据。',
      valueType: ModuleIOValueTypeEnum.any,
      type: FlowNodeOutputTypeEnum.source,
      targets: []
    },
    {
      ...Output_Template_AddOutput,
      editField: {
        key: true,
        description: true,
        dataType: true,
        defaultValue: true
      },
      defaultEditField: {
        label: '',
        key: '',
        description: '',
        outputType: FlowNodeOutputTypeEnum.source,
        valueType: ModuleIOValueTypeEnum.string
      }
    },
    Output_Template_Finish
  ]
};
packages/global/core/module/type.d.ts: 19 changes

@@ -9,6 +9,7 @@ import { DispatchNodeResponseKeyEnum } from './runtime/constants';
import { FlowNodeInputItemType, FlowNodeOutputItemType } from './node/type';
import { UserModelSchema } from 'support/user/type';
import {
  ChatItemType,
  ChatItemValueItemType,
  ToolRunResponseItemType,
  UserChatItemValueItemType
@@ -62,24 +63,6 @@ export type ModuleItemType = {
};

/* --------------- function type -------------------- */
- // variable
- export type VariableItemType = {
-   id: string;
-   key: string;
-   label: string;
-   type: `${VariableInputEnum}`;
-   required: boolean;
-   maxLen: number;
-   enums: { value: string }[];
- };
- // tts
- export type AppTTSConfigType = {
-   type: 'none' | 'web' | 'model';
-   model?: string;
-   voice?: string;
-   speed?: number;
- };

export type SelectAppItemType = {
  id: string;
  name: string;
@@ -6,10 +6,12 @@ import {
  variableMap
} from './constants';
import { FlowNodeInputItemType, FlowNodeOutputItemType } from './node/type';
- import { AppTTSConfigType, ModuleItemType, VariableItemType } from './type';
+ import { ModuleItemType } from './type';
+ import type { VariableItemType, AppTTSConfigType, AppWhisperConfigType } from '../app/type';
import { Input_Template_Switch } from './template/input';
import { EditorVariablePickerType } from '../../../web/components/common/Textarea/PromptEditor/type';
import { Output_Template_Finish } from './template/output';
+ import { defaultWhisperConfig } from '../app/constants';

/* module */
export const getGuideModule = (modules: ModuleItemType[]) =>
@@ -30,11 +32,16 @@ export const splitGuideModule = (guideModules?: ModuleItemType) => {
    (item) => item.key === ModuleInputKeyEnum.tts
  )?.value || { type: 'web' };

+ const whisperConfig: AppWhisperConfigType =
+   guideModules?.inputs?.find((item) => item.key === ModuleInputKeyEnum.whisper)?.value ||
+   defaultWhisperConfig;
+
  return {
    welcomeText,
    variableModules,
    questionGuide,
-   ttsConfig
+   ttsConfig,
+   whisperConfig
  };
};
@@ -5,6 +5,7 @@ export type PathDataType = {
  path: string;
  params: any[];
  request: any;
+ response: any;
};

export type OpenApiJsonSchema = {

@@ -41,9 +41,10 @@ export const str2OpenApiSchema = async (yamlStr = ''): Promise<OpenApiJsonSchema
  path,
  method,
  name: methodInfo.operationId || path,
- description: methodInfo.description,
+ description: methodInfo.description || methodInfo.summary,
  params: methodInfo.parameters,
- request: methodInfo?.requestBody
+ request: methodInfo?.requestBody,
+ response: methodInfo.responses
};
return result;
});
@@ -2,18 +2,18 @@
  "name": "@fastgpt/global",
  "version": "1.0.0",
  "dependencies": {
+   "@apidevtools/swagger-parser": "^10.1.0",
    "axios": "^1.5.1",
    "dayjs": "^1.11.7",
    "encoding": "^0.1.13",
    "js-tiktoken": "^1.0.7",
-   "openapi-types": "^12.1.3",
-   "openai": "4.28.0",
-   "nanoid": "^4.0.1",
    "js-yaml": "^4.1.0",
-   "timezones-list": "^3.0.2",
-   "next": "13.5.2",
    "jschardet": "3.1.1",
-   "@apidevtools/swagger-parser": "^10.1.0"
+   "nanoid": "^4.0.1",
+   "next": "13.5.2",
+   "openai": "4.28.0",
+   "openapi-types": "^12.1.3",
+   "timezones-list": "^3.0.2"
  },
  "devDependencies": {
    "@types/js-yaml": "^4.0.9",
@@ -1,5 +1,5 @@
import { TeamMemberRoleEnum } from './constant';
import { TeamMemberSchema } from './type';
import { LafAccountType, TeamMemberSchema } from './type';

export type AuthTeamRoleProps = {
  teamId: string;
@@ -10,12 +10,14 @@ export type CreateTeamProps = {
  name: string;
  avatar?: string;
  defaultTeam?: boolean;
  lafAccount?: LafAccountType;
};
export type UpdateTeamProps = {
  teamId: string;
  name?: string;
  avatar?: string;
  teamDomain?: string;
  lafAccount?: null | LafAccountType;
};

/* ------------- member ----------- */

8
packages/global/support/user/team/type.d.ts
vendored
@@ -1,5 +1,6 @@
import type { UserModelSchema } from '../type';
import type { TeamMemberRoleEnum, TeamMemberStatusEnum } from './constant';
import { LafAccountType } from './type';

export type TeamSchema = {
  _id: string;
@@ -13,6 +14,7 @@ export type TeamSchema = {
    lastExportDatasetTime: Date;
    lastWebsiteSyncTime: Date;
  };
  lafAccount: LafAccountType;
};
export type tagsType = {
  label: string;
@@ -58,6 +60,7 @@ export type TeamItemType = {
  role: `${TeamMemberRoleEnum}`;
  status: `${TeamMemberStatusEnum}`;
  canWrite: boolean;
  lafAccount?: LafAccountType;
};

export type TeamMemberItemType = {
@@ -74,3 +77,8 @@ export type TeamTagItemType = {
  label: string;
  key: string;
};

export type LafAccountType = {
  token: string;
  appid: string;
};

33
packages/service/common/buffer/rawText/schema.ts
Normal file
@@ -0,0 +1,33 @@
import { connectionMongo, type Model } from '../../mongo';
const { Schema, model, models } = connectionMongo;
import { RawTextBufferSchemaType } from './type';

export const collectionName = 'buffer.rawText';

const RawTextBufferSchema = new Schema({
  sourceId: {
    type: String,
    required: true
  },
  rawText: {
    type: String,
    default: ''
  },
  createTime: {
    type: Date,
    default: () => new Date()
  },
  metadata: Object
});

try {
  RawTextBufferSchema.index({ sourceId: 1 });
  // 20 minutes
  RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
} catch (error) {
  console.log(error);
}

export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
  models[collectionName] || model(collectionName, RawTextBufferSchema);
MongoRwaTextBuffer.syncIndexes();
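Note: this collection acts as a 20-minute read-through cache keyed by sourceId (the TTL index above handles eviction). A minimal usage sketch — the getRawTextCached helper below is hypothetical, not part of this diff:

import { MongoRwaTextBuffer } from './schema';

// Hypothetical helper: return cached raw text when present, otherwise compute and cache it.
export const getRawTextCached = async (sourceId: string, load: () => Promise<string>) => {
  const hit = await MongoRwaTextBuffer.findOne({ sourceId }).lean();
  if (hit) return hit.rawText;

  const rawText = await load();
  // Best-effort write; the TTL index expires the entry about 20 minutes later.
  await MongoRwaTextBuffer.create({ sourceId, rawText }).catch(() => {});
  return rawText;
};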
8
packages/service/common/buffer/rawText/type.d.ts
vendored
Normal file
@@ -0,0 +1,8 @@
export type RawTextBufferSchemaType = {
  sourceId: string;
  rawText: string;
  createTime: Date;
  metadata?: {
    filename: string;
  };
};
@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { TTSBufferSchemaType } from './type.d';

export const collectionName = 'ttsbuffers';
export const collectionName = 'buffer.tts';

const TTSBufferSchema = new Schema({
  bufferId: {

@@ -4,6 +4,11 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { ReadFileByBufferParams } from '../read/type';
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
import { readFileRawContent } from '../read/utils';

export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +116,106 @@ export async function getDownloadStream({

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
}

export const readFileEncode = async ({
  bucketName,
  fileId
}: {
  bucketName: `${BucketNameEnum}`;
  fileId: string;
}) => {
  const encodeStream = await getDownloadStream({ bucketName, fileId });
  let buffers: Buffer = Buffer.from([]);
  for await (const chunk of encodeStream) {
    buffers = Buffer.concat([buffers, chunk]);
    if (buffers.length > 10) {
      encodeStream.abort();
      break;
    }
  }

  const encoding = detectFileEncoding(buffers);

  return encoding as BufferEncoding;
};

export const readFileContentFromMongo = async ({
  teamId,
  bucketName,
  fileId,
  csvFormat = false
}: {
  teamId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  csvFormat?: boolean;
}): Promise<{
  rawText: string;
  filename: string;
}> => {
  // read buffer
  const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
  if (fileBuffer) {
    return {
      rawText: fileBuffer.rawText,
      filename: fileBuffer.metadata?.filename || ''
    };
  }

  const [file, encoding, fileStream] = await Promise.all([
    getFileById({ bucketName, fileId }),
    readFileEncode({ bucketName, fileId }),
    getDownloadStream({ bucketName, fileId })
  ]);

  if (!file) {
    return Promise.reject(CommonErrEnum.fileNotFound);
  }

  const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';

  const fileBuffers = await (() => {
    return new Promise<Buffer>((resolve, reject) => {
      let buffers = Buffer.from([]);
      fileStream.on('data', (chunk) => {
        buffers = Buffer.concat([buffers, chunk]);
      });
      fileStream.on('end', () => {
        resolve(buffers);
      });
      fileStream.on('error', (err) => {
        reject(err);
      });
    });
  })();

  const params: ReadFileByBufferParams = {
    teamId,
    buffer: fileBuffers,
    encoding,
    metadata: {
      relatedId: fileId
    }
  };

  const { rawText } = await readFileRawContent({
    extension,
    csvFormat,
    params
  });

  if (rawText.trim()) {
    MongoRwaTextBuffer.create({
      sourceId: fileId,
      rawText,
      metadata: {
        filename: file.filename
      }
    });
  }

  return {
    rawText,
    filename: file.filename
  };
};

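A sketch of calling readFileContentFromMongo (ids here are illustrative placeholders):

const { rawText, filename } = await readFileContentFromMongo({
  teamId: 'team_123', // hypothetical id
  bucketName: BucketNameEnum.dataset,
  fileId: '65f1c0d2...', // hypothetical GridFS file id
  csvFormat: true // csv/xlsx then return the "header:value" formatText instead of plain text
});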
@@ -14,7 +14,6 @@ export async function uploadMongoImg({
  teamId,
  expiredTime,
  metadata,

  shareId
}: UploadImgProps & {
  teamId: string;
@@ -30,9 +29,8 @@ export async function uploadMongoImg({
    type,
    teamId,
    binary,
    expiredTime: expiredTime,
    expiredTime,
    metadata,

    shareId
  });

@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
    enum: Object.keys(mongoImageTypeMap),
    required: true
  },

  metadata: {
    type: Object
  }
});

try {
  // tts expired
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
  ImageSchema.index({ createTime: 1 });

@@ -37,7 +37,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
  async doUpload<T = Record<string, any>>(
    req: NextApiRequest,
    res: NextApiResponse,
    originBuckerName?: `${BucketNameEnum}`
    originBucketName?: `${BucketNameEnum}`
  ) {
    return new Promise<{
      file: FileType;
@@ -52,7 +52,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
      }

      // check bucket name
      const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`;
      const bucketName = (req.body?.bucketName || originBucketName) as `${BucketNameEnum}`;
      if (bucketName && !bucketNameMap[bucketName]) {
        return reject('BucketName is invalid');
      }

21
packages/service/common/file/read/csv.ts
Normal file
@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';

// Load the source file content
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];

  const header = csvArr[0];

  const formatText = header
    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
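For reference, a worked example of the formatText built above: parsing "q,a\nhi,hello" yields rows [["q","a"],["hi","hello"]], so formatText is "q:q\na:a\nq:hi\na:hello" — note the header row is also prefixed with itself, which matches the code as written.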
23
packages/service/common/file/read/html.ts
Normal file
@@ -0,0 +1,23 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { htmlToMarkdown } from '../../string/markdown';
import { readFileRawText } from './rawText';

export const readHtmlRawText = async (
  params: ReadFileByBufferParams
): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: html } = readFileRawText(params);

  const md = await htmlToMarkdown(html);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};
18
packages/service/common/file/read/markdown.ts
Normal file
@@ -0,0 +1,18 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { readFileRawText } from './rawText';

export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: md } = readFileRawText(params);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};
119
packages/service/common/file/read/parseOffice.ts
Normal file
@@ -0,0 +1,119 @@
import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../utils';
import { addLog } from '../../system/log';

const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';

function getNewFileName(ext: string) {
  return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}

const parseString = (xml: string) => {
  let parser = new DOMParser();
  return parser.parseFromString(xml, 'text/xml');
};

const parsePowerPoint = async ({
  filepath,
  decompressPath,
  encoding
}: {
  filepath: string;
  decompressPath: string;
  encoding: BufferEncoding;
}) => {
  // Files regex that hold our content of interest
  const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
  const slidesRegex = /ppt\/slides\/slide\d+.xml/g;

  /** The decompress location which contains the filename in it */

  const files = await decompress(filepath, decompressPath, {
    filter: (x) => !!x.path.match(allFilesRegex)
  });

  // Verify that at least the slide xml files exist in the extracted files list.
  if (
    files.length == 0 ||
    !files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
  ) {
    return Promise.reject('解析 PPT 失败');
  }

  // Returning an array of all the xml contents read using fs.readFileSync
  const xmlContentArray = files.map((file) =>
    fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
  );

  let responseArr: string[] = [];

  xmlContentArray.forEach((xmlContent) => {
    /** Find text nodes with a:p tags */
    const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');

    /** Store all the text content to respond */
    responseArr.push(
      Array.from(xmlParagraphNodesList)
        // Filter out paragraph nodes that do not have any text nodes, which are identifiable by a:t tags
        .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
        .map((paragraphNode) => {
          /** Find text nodes with a:t tags */
          const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
          return Array.from(xmlTextNodeList)
            .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
            .map((textNode) => textNode.childNodes[0].nodeValue)
            .join('');
        })
        .join('\n')
    );
  });

  return responseArr.join('\n');
};

export const parseOffice = async ({
  buffer,
  encoding,
  extension
}: {
  buffer: Buffer;
  encoding: BufferEncoding;
  extension: string;
}) => {
  // Prepare file for processing
  // create temp file subdirectory if it does not exist
  if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
    fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
  }

  // temp file name
  const filepath = getNewFileName(extension);
  const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
  // const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;

  // write new file
  fs.writeFileSync(filepath, buffer, {
    encoding
  });

  const text = await (async () => {
    try {
      switch (extension) {
        case 'pptx':
          return parsePowerPoint({ filepath, decompressPath, encoding });
        default:
          return Promise.reject('只能读取 .pptx 文件');
      }
    } catch (error) {
      addLog.error(`Load ppt error`, { error });
    }
    return '';
  })();

  fs.unlinkSync(filepath);
  clearDirFiles(decompressPath);
  return text;
};
@@ -1,5 +1,7 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
import { ReadFileByBufferParams, ReadFileResponse } from './type';

type TokenType = {
  str: string;
@@ -11,9 +13,9 @@ type TokenType = {
  hasEOL: boolean;
};

export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

export const readPdfFile = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();
@@ -51,14 +53,19 @@ export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  const loadingTask = pdfjs.getDocument(buffer.buffer);
  const doc = await loadingTask.promise;

  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  loadingTask.destroy();

  return {
    rawText: pageTexts.join('')
    rawText: pageTexts.join(''),
    metadata: {}
  };
};
14
packages/service/common/file/read/pptx.ts
Normal file
@@ -0,0 +1,14 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from './parseOffice';

export const readPptxRawText = async ({
  buffer,
  encoding
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = await parseOffice({ buffer, encoding, extension: 'pptx' });

  return {
    rawText: result
  };
};
10
packages/service/common/file/read/rawText.ts
Normal file
@@ -0,0 +1,10 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';

// Load the source file content
export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => {
  const content = buffer.toString(encoding);

  return {
    rawText: content
  };
};
12
packages/service/common/file/read/type.d.ts
vendored
Normal file
@@ -0,0 +1,12 @@
export type ReadFileByBufferParams = {
  teamId: string;
  buffer: Buffer;
  encoding: BufferEncoding;
  metadata?: Record<string, any>;
};

export type ReadFileResponse = {
  rawText: string;
  formatText?: string;
  metadata?: Record<string, any>;
};
81
packages/service/common/file/read/utils.ts
Normal file
@@ -0,0 +1,81 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
import { ReadFileByBufferParams } from './type';
import { readFileRawText } from '../read/rawText';
import { readMarkdown } from '../read/markdown';
import { readHtmlRawText } from '../read/html';
import { readPdfFile } from '../read/pdf';
import { readWordFile } from '../read/word';
import { readCsvRawText } from '../read/csv';
import { readPptxRawText } from '../read/pptx';
import { readXlsxRawText } from '../read/xlsx';

export const initMarkdownText = ({
  teamId,
  md,
  metadata
}: {
  md: string;
  teamId: string;
  metadata?: Record<string, any>;
}) =>
  markdownProcess({
    rawText: md,
    uploadImgController: (base64Img) =>
      uploadMongoImg({
        type: MongoImageTypeEnum.collectionImage,
        base64Img,
        teamId,
        metadata,
        expiredTime: addHours(new Date(), 2)
      })
  });

export const readFileRawContent = async ({
  extension,
  csvFormat,
  params
}: {
  csvFormat?: boolean;
  extension: string;
  params: ReadFileByBufferParams;
}) => {
  switch (extension) {
    case 'txt':
      return readFileRawText(params);
    case 'md':
      return readMarkdown(params);
    case 'html':
      return readHtmlRawText(params);
    case 'pdf':
      return readPdfFile(params);
    case 'docx':
      return readWordFile(params);
    case 'pptx':
      return readPptxRawText(params);
    case 'xlsx':
      const xlsxResult = await readXlsxRawText(params);
      if (csvFormat) {
        return {
          rawText: xlsxResult.formatText || ''
        };
      }
      return {
        rawText: xlsxResult.rawText
      };
    case 'csv':
      const csvResult = await readCsvRawText(params);
      if (csvFormat) {
        return {
          rawText: csvResult.formatText || ''
        };
      }
      return {
        rawText: csvResult.rawText
      };
    default:
      return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
  }
};
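A minimal sketch of calling the dispatcher above directly (teamId and buffer are illustrative):

import { readFileRawContent } from './utils';

const { rawText } = await readFileRawContent({
  extension: 'md',
  params: {
    teamId: 'team_123', // hypothetical id
    buffer: Buffer.from('# Title\n\nbody', 'utf-8'),
    encoding: 'utf-8'
  }
});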
35
packages/service/common/file/read/word.ts
Normal file
@@ -0,0 +1,35 @@
import mammoth from 'mammoth';
import { htmlToMarkdown } from '../../string/markdown';
import { ReadFileByBufferParams, ReadFileResponse } from './type';
import { initMarkdownText } from './utils';

/**
 * read docx to markdown
 */
export const readWordFile = async ({
  teamId,
  buffer,
  metadata = {}
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  try {
    const { value: html } = await mammoth.convertToHtml({
      buffer
    });

    const md = await htmlToMarkdown(html);

    const rawText = await initMarkdownText({
      teamId,
      md,
      metadata
    });

    return {
      rawText,
      metadata: {}
    };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
45
packages/service/common/file/read/xlsx.ts
Normal file
@@ -0,0 +1,45 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';

export const readXlsxRawText = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = xlsx.parse(buffer, {
    skipHidden: false,
    defval: ''
  });

  const format2Csv = result.map(({ name, data }) => {
    return {
      title: `#${name}`,
      csvText: data.map((item) => item.join(',')).join('\n')
    };
  });

  const rawText = format2Csv.map((item) => item.csvText).join('\n');
  const formatText = format2Csv
    .map((item) => {
      const csvArr = Papa.parse(item.csvText).data as string[][];
      const header = csvArr[0];

      const formatText = header
        ? csvArr
            .map((item) =>
              item
                .map((item, i) => (item ? `${header[i]}:${item}` : ''))
                .filter(Boolean)
                .join('\n')
            )
            .join('\n')
        : '';

      return `${item.title}\n${formatText}`;
    })
    .join('\n');

  return {
    rawText: rawText,
    formatText
  };
};
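For reference, a sheet named users with rows [["name","age"],["Tom","3"]] produces rawText "name,age\nTom,3" and formatText "#users\nname:name\nage:age\nname:Tom\nage:3" under the code above — empty cells are dropped by the filter(Boolean) step.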
@@ -1,4 +1,6 @@
import { isProduction } from '../system/constants';
import fs from 'fs';
import path from 'path';

export const removeFilesByPaths = (paths: string[]) => {
  paths.forEach((path) => {
@@ -33,12 +35,34 @@ export const clearDirFiles = (dirPath: string) => {
    return;
  }

  fs.readdirSync(dirPath).forEach((file) => {
    const curPath = `${dirPath}/${file}`;
    if (fs.lstatSync(curPath).isDirectory()) {
      clearDirFiles(curPath);
    } else {
      fs.unlinkSync(curPath);
  fs.rmdirSync(dirPath, {
    recursive: true
  });
};

export const clearTmpUploadFiles = () => {
  if (!isProduction) return;
  const tmpPath = '/tmp';

  fs.readdir(tmpPath, (err, files) => {
    if (err) return;

    for (const file of files) {
      if (file === 'v8-compile-cache-0') continue;

      const filePath = path.join(tmpPath, file);

      fs.stat(filePath, (err, stats) => {
        if (err) return;

        // If the file was uploaded more than 2 hours ago, treat it as a temp file and delete it
        if (Date.now() - stats.mtime.getTime() > 2 * 60 * 60 * 1000) {
          fs.unlink(filePath, (err) => {
            if (err) return;
            console.log(`Deleted temp file: ${filePath}`);
          });
        }
      });
    }
  });
};

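clearTmpUploadFiles is presumably wired to a scheduled job; a sketch using node-cron (already a dependency of this package) with an illustrative schedule:

import cron from 'node-cron';
import { clearTmpUploadFiles } from './common/file/utils';

// Hypothetical wiring: sweep /tmp once an hour for stale upload files.
cron.schedule('0 * * * *', () => {
  clearTmpUploadFiles();
});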
@@ -1 +1,3 @@
export const FastGPTProUrl = process.env.PRO_URL ? `${process.env.PRO_URL}/api` : '';

export const isProduction = process.env.NODE_ENV === 'production';

15
packages/service/common/system/timerLock/constants.ts
Normal file
@@ -0,0 +1,15 @@
export enum TimerIdEnum {
  checkInValidDatasetFiles = 'checkInValidDatasetFiles',
  checkInvalidDatasetData = 'checkInvalidDatasetData',
  checkInvalidVector = 'checkInvalidVector',
  clearExpiredSubPlan = 'clearExpiredSubPlan',
  updateStandardPlan = 'updateStandardPlan'
}

export const timerIdMap = {
  [TimerIdEnum.checkInValidDatasetFiles]: 'checkInValidDatasetFiles',
  [TimerIdEnum.checkInvalidDatasetData]: 'checkInvalidDatasetData',
  [TimerIdEnum.checkInvalidVector]: 'checkInvalidVector',
  [TimerIdEnum.clearExpiredSubPlan]: 'clearExpiredSubPlan',
  [TimerIdEnum.updateStandardPlan]: 'updateStandardPlan'
};
29
packages/service/common/system/timerLock/schema.ts
Normal file
@@ -0,0 +1,29 @@
import { connectionMongo, type Model } from '../../mongo';
import { timerIdMap } from './constants';
const { Schema, model, models } = connectionMongo;
import { TimerLockSchemaType } from './type.d';

export const collectionName = 'systemtimerlocks';

const TimerLockSchema = new Schema({
  timerId: {
    type: String,
    required: true,
    unique: true,
    enum: Object.keys(timerIdMap)
  },
  expiredTime: {
    type: Date,
    required: true
  }
});

try {
  TimerLockSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 5 });
} catch (error) {
  console.log(error);
}

export const MongoTimerLock: Model<TimerLockSchemaType> =
  models[collectionName] || model(collectionName, TimerLockSchema);
MongoTimerLock.syncIndexes();
5
packages/service/common/system/timerLock/type.d.ts
vendored
Normal file
@@ -0,0 +1,5 @@
export type TimerLockSchemaType = {
  _id: string;
  timerId: string;
  expiredTime: Date;
};
25
packages/service/common/system/timerLock/utils.ts
Normal file
@@ -0,0 +1,25 @@
import { TimerIdEnum } from './constants';
import { MongoTimerLock } from './schema';
import { addMinutes } from 'date-fns';

/*
  Rely on the unique index so that only one task runs at a time: a lock created later
  fails on the unique-index insert, so that caller cannot proceed with the task.
*/
export const checkTimerLock = async ({
  timerId,
  lockMinuted
}: {
  timerId: `${TimerIdEnum}`;
  lockMinuted: number;
}) => {
  try {
    await MongoTimerLock.create({
      timerId,
      expiredTime: addMinutes(new Date(), lockMinuted)
    });

    return true;
  } catch (error) {
    return false;
  }
};
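A sketch of the intended call pattern, so a job scheduled on several instances runs on only one of them (the job body and lock duration here are illustrative):

import { checkTimerLock } from './utils';
import { TimerIdEnum } from './constants';

// Hypothetical scheduled entry point.
async function checkInvalidVectorJob() {
  // Only the instance that wins the unique-index insert proceeds.
  if (!(await checkTimerLock({ timerId: TimerIdEnum.checkInvalidVector, lockMinuted: 10 }))) {
    return;
  }
  // ...do the actual maintenance work here
}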
@@ -3,6 +3,7 @@ import { getAIApi } from '../config';
import { ChatItemType } from '@fastgpt/global/core/chat/type';
import { countGptMessagesTokens } from '@fastgpt/global/common/string/tiktoken';
import { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt';

/*
  query extension - question expansion
@@ -117,7 +118,7 @@ A: ${chatBg}
  const historyFewShot = histories
    .map((item) => {
      const role = item.obj === 'Human' ? 'Q' : 'A';
      return `${role}: ${item.value}`;
      return `${role}: ${chatValue2RuntimePrompt(item.value).text}`;
    })
    .join('\n');
  const concatFewShot = `${systemFewShot}${historyFewShot}`.trim();

@@ -9,7 +9,6 @@ import {
  DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';

6
packages/service/core/dataset/training/constants.ts
Normal file
@@ -0,0 +1,6 @@
export enum ImportDataSourceEnum {
  fileLocal = 'fileLocal',
  fileLink = 'fileLink',
  fileCustom = 'fileCustom',
  tableLocal = 'tableLocal'
}
@@ -1,14 +1,16 @@
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
import type {
  PushDatasetDataChunkProps,
  PushDatasetDataProps,
  PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { getCollectionWithDataset } from '../controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getVectorModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';

export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
  try {
@@ -23,31 +25,52 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
  } catch (error) {}
};

export async function pushDataListToTrainingQueue({
  teamId,
  tmbId,
export const pushDataListToTrainingQueueByCollectionId = async ({
  collectionId,
  data,
  prompt,
  billId,
  trainingMode = TrainingModeEnum.chunk
  ...props
}: {
  teamId: string;
  tmbId: string;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
  const vectorModelList = global.vectorModels;
  const datasetModelList = global.llmModels;

  session?: ClientSession;
} & PushDatasetDataProps) => {
  const {
    datasetId: { _id: datasetId, vectorModel, agentModel }
    datasetId: { _id: datasetId, agentModel, vectorModel }
  } = await getCollectionWithDataset(collectionId);
  return pushDataListToTrainingQueue({
    ...props,
    datasetId,
    collectionId,
    agentModel,
    vectorModel
  });
};

export async function pushDataListToTrainingQueue({
  teamId,
  tmbId,
  datasetId,
  collectionId,
  agentModel,
  vectorModel,
  data,
  prompt,
  billId,
  trainingMode = TrainingModeEnum.chunk,
  session
}: {
  teamId: string;
  tmbId: string;
  datasetId: string;
  agentModel: string;
  vectorModel: string;
  session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
  const checkModelValid = async () => {
    const agentModelData = datasetModelList?.find((item) => item.model === agentModel);
    const agentModelData = getLLMModel(agentModel);
    if (!agentModelData) {
      return Promise.reject(`File model ${agentModel} is inValid`);
    }
    const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
    const vectorModelData = getVectorModel(vectorModel);
    if (!vectorModelData) {
      return Promise.reject(`Vector model ${vectorModel} is inValid`);
    }
@@ -107,7 +130,7 @@ export async function pushDataListToTrainingQueue({
    const text = item.q + item.a;

    // count q token
    const token = countPromptTokens(item.q);
    const token = item.q.length;

    if (token > maxToken) {
      filterResult.overToken.push(item);
@@ -124,52 +147,43 @@ export async function pushDataListToTrainingQueue({
  });

  // insert data to db
  const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
    try {
      const results = await MongoDatasetTraining.insertMany(
        dataList.map((item, i) => ({
          teamId,
          tmbId,
          datasetId,
          collectionId,
          billId,
          mode: trainingMode,
          prompt,
          model,
          q: item.q,
          a: item.a,
          chunkIndex: item.chunkIndex ?? 0,
          weight: weight ?? 0,
          indexes: item.indexes
        }))
      );
      await delay(500);
      return results.length;
    } catch (error) {
      if (retry > 0) {
        await delay(500);
        return insertData(dataList, retry - 1);
      }
      return Promise.reject(error);
    }
  };
  const insertLen = filterResult.success.length;
  const failedDocuments: PushDatasetDataChunkProps[] = [];

  let insertLen = 0;
  const chunkSize = 50;
  const chunkList = filterResult.success.reduce(
    (acc, cur) => {
      const lastChunk = acc[acc.length - 1];
      if (lastChunk.length < chunkSize) {
        lastChunk.push(cur);
      } else {
        acc.push([cur]);
  // Bulk insert with insertMany
  try {
    await MongoDatasetTraining.insertMany(
      filterResult.success.map((item) => ({
        teamId,
        tmbId,
        datasetId,
        collectionId,
        billId,
        mode: trainingMode,
        prompt,
        model,
        q: item.q,
        a: item.a,
        chunkIndex: item.chunkIndex ?? 0,
        weight: weight ?? 0,
        indexes: item.indexes
      })),
      {
        session
      }
      return acc;
    },
    [[]] as PushDatasetDataChunkProps[][]
  );
  for await (const chunks of chunkList) {
    insertLen += await insertData(chunks);
    );
  } catch (error: any) {
    addLog.error(`Insert error`, error);
    // If there are errors, collect the failed documents into the failed list
    error.writeErrors.forEach((writeError: any) => {
      failedDocuments.push(data[writeError.index]);
    });
    console.log('failed', failedDocuments);
  }

  // For failed documents, retry inserting them one by one
  for await (const item of failedDocuments) {
    await MongoDatasetTraining.create(item);
  }

  delete filterResult.success;

@@ -2,6 +2,7 @@ import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { addLog } from '../../../common/system/log';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { MongoDatasetTraining } from './schema';
import Papa from 'papaparse';

export const checkInvalidChunkAndLock = async ({
  err,
@@ -39,3 +40,18 @@ export const checkInvalidChunkAndLock = async ({
  }
  return false;
};

export const parseCsvTable2Chunks = (rawText: string) => {
  const csvArr = Papa.parse(rawText).data as string[][];

  const chunks = csvArr
    .map((item) => ({
      q: item[0] || '',
      a: item[1] || ''
    }))
    .filter((item) => item.q || item.a);

  return {
    chunks
  };
};

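For reference: parseCsvTable2Chunks('q1,a1\nq2,a2') returns { chunks: [{ q: 'q1', a: 'a1' }, { q: 'q2', a: 'a2' }] }; rows where both columns are empty are filtered out.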
@@ -210,7 +210,6 @@ export const runToolWithToolChoice = async (
  ).filter(Boolean) as ToolRunResponseType;

  const flatToolsResponseData = toolsRunResponse.map((item) => item.moduleRunResponse).flat();

  if (toolCalls.length > 0 && !res.closed) {
    // Run the tool, combine its results, and perform another round of AI calls
    const assistantToolMsgParams: ChatCompletionAssistantToolParam = {

@@ -37,6 +37,7 @@ import { dispatchRunTools } from './agent/runTool/index';
import { ChatItemValueTypeEnum } from '@fastgpt/global/core/chat/constants';
import { DispatchFlowResponse } from './type';
import { dispatchStopToolCall } from './agent/runTool/stopTool';
import { dispatchLafRequest } from './tools/runLaf';

const callbackMap: Record<`${FlowNodeTypeEnum}`, Function> = {
  [FlowNodeTypeEnum.historyNode]: dispatchHistory,
@@ -56,6 +57,7 @@ const callbackMap: Record<`${FlowNodeTypeEnum}`, Function> = {
  [FlowNodeTypeEnum.queryExtension]: dispatchQueryExtension,
  [FlowNodeTypeEnum.tools]: dispatchRunTools,
  [FlowNodeTypeEnum.stopTool]: dispatchStopToolCall,
  [FlowNodeTypeEnum.lafModule]: dispatchLafRequest,

  // none
  [FlowNodeTypeEnum.userGuide]: () => Promise.resolve()

@@ -10,6 +10,7 @@ import { valueTypeFormat } from '../utils';
import { SERVICE_LOCAL_HOST } from '../../../../common/system/tools';
import { addLog } from '../../../../common/system/log';
import { DispatchNodeResultType } from '@fastgpt/global/core/module/runtime/type';
import { getErrText } from '@fastgpt/global/common/error/utils';

type PropsArrType = {
  key: string;
@@ -61,7 +62,7 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
    chatId,
    responseChatItemId,
    ...variables,
    histories: histories.slice(0, 10),
    histories: histories.slice(-10),
    ...body
  };

@@ -139,7 +140,8 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
        body: Object.keys(requestBody).length > 0 ? requestBody : undefined,
        headers: Object.keys(headers).length > 0 ? headers : undefined,
        httpResult: { error: formatHttpError(error) }
      }
      },
      [ModuleOutputKeyEnum.httpRawResponse]: getErrText(error)
    };
  }
};
@@ -165,6 +167,7 @@ async function fetchData({
      'Content-Type': 'application/json',
      ...headers
    },
    timeout: 120000,
    params: params,
    data: ['POST', 'PUT', 'PATCH'].includes(method) ? body : undefined
  });

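Worth noting in the hunks above: histories.slice(0, 10) took the first 10 chat records, while histories.slice(-10) takes the most recent 10 — presumably the intended context for the HTTP module.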
212
packages/service/core/workflow/dispatch/tools/runLaf.ts
Normal file
@@ -0,0 +1,212 @@
import type { ModuleDispatchProps } from '@fastgpt/global/core/module/type.d';
import {
  DYNAMIC_INPUT_KEY,
  ModuleInputKeyEnum,
  ModuleOutputKeyEnum
} from '@fastgpt/global/core/module/constants';
import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/module/runtime/constants';
import axios from 'axios';
import { valueTypeFormat } from '../utils';
import { SERVICE_LOCAL_HOST } from '../../../../common/system/tools';
import { addLog } from '../../../../common/system/log';
import { DispatchNodeResultType } from '@fastgpt/global/core/module/runtime/type';

type LafRequestProps = ModuleDispatchProps<{
  [ModuleInputKeyEnum.httpReqUrl]: string;
  [DYNAMIC_INPUT_KEY]: Record<string, any>;
  [key: string]: any;
}>;
type LafResponse = DispatchNodeResultType<{
  [ModuleOutputKeyEnum.failed]?: boolean;
  [key: string]: any;
}>;

const UNDEFINED_SIGN = 'UNDEFINED_SIGN';

export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafResponse> => {
  let {
    appId,
    chatId,
    responseChatItemId,
    variables,
    module: { outputs },
    histories,
    params: { system_httpReqUrl: httpReqUrl, [DYNAMIC_INPUT_KEY]: dynamicInput, ...body }
  } = props;

  if (!httpReqUrl) {
    return Promise.reject('Http url is empty');
  }

  const concatVariables = {
    appId,
    chatId,
    responseChatItemId,
    ...variables,
    ...body
  };

  httpReqUrl = replaceVariable(httpReqUrl, concatVariables);

  const requestBody = {
    systemParams: {
      appId,
      chatId,
      responseChatItemId,
      histories: histories.slice(0, 10)
    },
    variables,
    ...dynamicInput,
    ...body
  };

  try {
    const { formatResponse, rawResponse } = await fetchData({
      method: 'POST',
      url: httpReqUrl,
      body: requestBody
    });

    // format output value type
    const results: Record<string, any> = {};
    for (const key in formatResponse) {
      const output = outputs.find((item) => item.key === key);
      if (!output) continue;
      results[key] = valueTypeFormat(formatResponse[key], output.valueType);
    }

    return {
      assistantResponses: [],
      [DispatchNodeResponseKeyEnum.nodeResponse]: {
        totalPoints: 0,
        body: Object.keys(requestBody).length > 0 ? requestBody : undefined,
        httpResult: rawResponse
      },
      [DispatchNodeResponseKeyEnum.toolResponses]: rawResponse,
      [ModuleOutputKeyEnum.httpRawResponse]: rawResponse,
      ...results
    };
  } catch (error) {
    addLog.error('Http request error', error);
    return {
      [ModuleOutputKeyEnum.failed]: true,
      [DispatchNodeResponseKeyEnum.nodeResponse]: {
        totalPoints: 0,
        body: Object.keys(requestBody).length > 0 ? requestBody : undefined,
        httpResult: { error: formatHttpError(error) }
      }
    };
  }
};

async function fetchData({
  method,
  url,
  body
}: {
  method: string;
  url: string;
  body: Record<string, any>;
}): Promise<Record<string, any>> {
  const { data: response } = await axios({
    method,
    baseURL: `http://${SERVICE_LOCAL_HOST}`,
    url,
    headers: {
      'Content-Type': 'application/json'
    },
    data: body
  });

  const parseJson = (obj: Record<string, any>, prefix = '') => {
    let result: Record<string, any> = {};

    if (Array.isArray(obj)) {
      for (let i = 0; i < obj.length; i++) {
        result[`${prefix}[${i}]`] = obj[i];

        if (Array.isArray(obj[i])) {
          result = {
            ...result,
            ...parseJson(obj[i], `${prefix}[${i}]`)
          };
        } else if (typeof obj[i] === 'object') {
          result = {
            ...result,
            ...parseJson(obj[i], `${prefix}[${i}].`)
          };
        }
      }
    } else if (typeof obj == 'object') {
      for (const key in obj) {
        result[`${prefix}${key}`] = obj[key];

        if (Array.isArray(obj[key])) {
          result = {
            ...result,
            ...parseJson(obj[key], `${prefix}${key}`)
          };
        } else if (typeof obj[key] === 'object') {
          result = {
            ...result,
            ...parseJson(obj[key], `${prefix}${key}.`)
          };
        }
      }
    }

    return result;
  };

  return {
    formatResponse:
      typeof response === 'object' && !Array.isArray(response) ? parseJson(response) : {},
    rawResponse: response
  };
}

function replaceVariable(text: string, obj: Record<string, any>) {
  for (const [key, value] of Object.entries(obj)) {
    if (value === undefined) {
      text = text.replace(new RegExp(`{{${key}}}`, 'g'), UNDEFINED_SIGN);
    } else {
      const replacement = JSON.stringify(value);
      const unquotedReplacement =
        replacement.startsWith('"') && replacement.endsWith('"')
          ? replacement.slice(1, -1)
          : replacement;
      text = text.replace(new RegExp(`{{${key}}}`, 'g'), unquotedReplacement);
    }
  }
  return text || '';
}
function removeUndefinedSign(obj: Record<string, any>) {
  for (const key in obj) {
    if (obj[key] === UNDEFINED_SIGN) {
      obj[key] = undefined;
    } else if (Array.isArray(obj[key])) {
      obj[key] = obj[key].map((item: any) => {
        if (item === UNDEFINED_SIGN) {
          return undefined;
        } else if (typeof item === 'object') {
          removeUndefinedSign(item);
        }
        return item;
      });
    } else if (typeof obj[key] === 'object') {
      removeUndefinedSign(obj[key]);
    }
  }
  return obj;
}
function formatHttpError(error: any) {
  return {
    message: error?.message,
    name: error?.name,
    method: error?.config?.method,
    baseURL: error?.config?.baseURL,
    url: error?.config?.url,
    code: error?.code,
    status: error?.status
  };
}
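For reference, the parseJson helper in runLaf.ts flattens a nested response into dotted/bracketed keys so values can be matched against module output keys. A worked example with illustrative input:

// parseJson({ a: { b: 1 }, c: [2] }) returns:
// { 'a': { b: 1 }, 'a.b': 1, 'c': [2], 'c[0]': 2 }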
@@ -4,27 +4,36 @@
  "dependencies": {
    "@fastgpt/global": "workspace:*",
    "@node-rs/jieba": "1.10.0",
    "@xmldom/xmldom": "^0.8.10",
    "axios": "^1.5.1",
    "cheerio": "1.0.0-rc.12",
    "cookie": "^0.5.0",
    "date-fns": "2.30.0",
    "dayjs": "^1.11.7",
    "decompress": "^4.2.1",
    "encoding": "^0.1.13",
    "file-type": "^19.0.0",
    "json5": "^2.2.3",
    "jsonwebtoken": "^9.0.2",
    "mammoth": "^1.6.0",
    "mongoose": "^7.0.2",
    "multer": "1.4.5-lts.1",
    "next": "13.5.2",
    "nextjs-cors": "^2.1.2",
    "node-cron": "^3.0.3",
    "node-xlsx": "^0.23.0",
    "papaparse": "5.4.1",
    "pdfjs-dist": "4.0.269",
    "pg": "^8.10.0",
    "tunnel": "^0.0.6"
  },
  "devDependencies": {
    "@types/cookie": "^0.5.2",
    "@types/decompress": "^4.2.7",
    "@types/jsonwebtoken": "^9.0.3",
    "@types/multer": "^1.4.10",
    "@types/node-cron": "^3.0.11",
    "@types/papaparse": "5.3.7",
    "@types/pg": "^8.6.6",
    "@types/tunnel": "^0.0.4"
  }

42
packages/service/support/permission/auth/file.ts
Normal file
@@ -0,0 +1,42 @@
import { AuthResponseType } from '@fastgpt/global/support/permission/type';
import { AuthModeType } from '../type';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { parseHeaderCert } from '../controller';
import { getFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';

export async function authFile({
  fileId,
  per = 'owner',
  ...props
}: AuthModeType & {
  fileId: string;
}): Promise<
  AuthResponseType & {
    file: DatasetFileSchema;
  }
> {
  const authRes = await parseHeaderCert(props);
  const { teamId, tmbId } = authRes;

  const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });

  if (!file) {
    return Promise.reject(CommonErrEnum.fileNotFound);
  }

  if (file.metadata?.teamId !== teamId) {
    return Promise.reject(CommonErrEnum.unAuthFile);
  }
  if (per === 'owner' && file.metadata?.tmbId !== tmbId) {
    return Promise.reject(CommonErrEnum.unAuthFile);
  }

  return {
    ...authRes,
    isOwner: per === 'owner',
    canWrite: per === 'owner',
    file
  };
}
@@ -10,7 +10,6 @@ import { MongoTeam } from './teamSchema';

async function getTeamMember(match: Record<string, any>): Promise<TeamItemType> {
  const tmb = (await MongoTeamMember.findOne(match).populate('teamId')) as TeamMemberWithTeamSchema;

  if (!tmb) {
    return Promise.reject('member not exist');
  }
@@ -27,7 +26,8 @@ async function getTeamMember(match: Record<string, any>): Promise<TeamItemType>
    role: tmb.role,
    status: tmb.status,
    defaultTeam: tmb.defaultTeam,
    canWrite: tmb.role !== TeamMemberRoleEnum.visitor
    canWrite: tmb.role !== TeamMemberRoleEnum.visitor,
    lafAccount: tmb.teamId.lafAccount
  };
}

@@ -35,6 +35,14 @@ const TeamSchema = new Schema({
  lastWebsiteSyncTime: {
    type: Date
  }
  },
  lafAccount: {
    token: {
      type: String
    },
    appid: {
      type: String
    }
  }
});

@@ -1,40 +0,0 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';

/**
 * read csv to json
 * @response {
 *   header: string[],
 *   data: string[][]
 * }
 */
export const readCsvContent = async ({ file }: { file: File }) => {
  try {
    const { rawText: textArr } = await readFileRawText(file);
    const csvArr = Papa.parse(textArr).data as string[][];
    if (csvArr.length === 0) {
      throw new Error('csv 解析失败');
    }

    const header = csvArr.shift() as string[];

    // add title to data
    const rawText = csvArr
      .map((item) =>
        item.map((value, index) => {
          if (!header[index]) return value;
          return `${header[index]}: ${value}`;
        })
      )
      .flat()
      .join('\n');

    return {
      rawText,
      header,
      data: csvArr.map((item) => item)
    };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败');
  }
};
@@ -1,21 +0,0 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';

export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText } = await readFileRawText(file);
  const md = htmlStr2Md(rawText);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};
@@ -1,49 +0,0 @@
import { loadFile2Buffer } from '../utils';
import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';

export const readFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
  rawText: string;
}> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'txt':
      return readFileRawText(file);
    case 'md':
      return readMdFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'html':
      return readHtmlFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'csv':
      return readCsvContent({ file });
    case 'pdf':
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });
    case 'docx':
      return readWordFile({
        file,
        uploadImgController: uploadBase64Controller
      });

    default:
      return {
        rawText: ''
      };
  }
};
@@ -1,17 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';

export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText: md } = await readFileRawText(file);
  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });
  return { rawText: simpleMd };
};