Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 3fadabd28b
commit 139b142293
106 changed files with 2337 additions and 1454 deletions

View File

@@ -142,7 +142,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
teamId: data.teamId,
tmbId: data.tmbId,
collectionId: data.collectionId,
trainingMode: TrainingModeEnum.chunk,
mode: TrainingModeEnum.chunk,
data: qaArr.map((item) => ({
...item,
chunkIndex: data.chunkIndex
@@ -179,9 +179,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
}
/**
* 检查文本是否按格式返回
*/
// Format qa answer
function formatSplitText(text: string, rawText: string) {
text = text.replace(/\\n/g, '\n'); // Convert literal "\n" escape sequences into real newline characters
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式
@@ -194,13 +192,7 @@ function formatSplitText(text: string, rawText: string) {
if (q) {
result.push({
q,
a,
indexes: [
{
defaultIndex: true,
text: `${q}\n${a.trim().replace(/\n\s*/g, '\n')}`
}
]
a
});
}
}
@@ -211,13 +203,7 @@ function formatSplitText(text: string, rawText: string) {
chunks.forEach((chunk) => {
result.push({
q: chunk,
a: '',
indexes: [
{
defaultIndex: true,
text: chunk
}
]
a: ''
});
});
}