feat: 拆分文本增加滑块，增加直接分段导入方式

2023-04-23 22:36:04 +08:00
parent 2774940851
commit e0b1a78344
15 changed files with 317 additions and 155 deletions
--- a/src/pages/api/model/data/pushModelDataInput.ts
+++ b/src/pages/api/model/data/pushModelDataInput.ts
@@ -4,7 +4,7 @@ import { connectToDatabase, Model } from '@/service/mongo';
 import { authToken } from '@/service/utils/tools';
 import { ModelDataSchema } from '@/types/mongoSchema';
 import { generateVector } from '@/service/events/generateVector';
-import { connectPg, PgClient } from '@/service/pg';
+import { PgClient } from '@/service/pg';

 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
@@ -26,7 +26,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
    const userId = await authToken(authorization);

    await connectToDatabase();
-    const pg = await connectPg();

    // 验证是否是该用户的 model
    const model = await Model.findOne({
--- a/src/pages/api/model/data/splitData.ts
+++ b/src/pages/api/model/data/splitData.ts
@@ -2,14 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@/service/response';
 import { connectToDatabase, SplitData, Model } from '@/service/mongo';
 import { authToken } from '@/service/utils/tools';
+import { generateVector } from '@/service/events/generateVector';
 import { generateQA } from '@/service/events/generateQA';
-import { encode } from 'gpt-token-utils';
+import { PgClient } from '@/service/pg';

 /* 拆分数据成QA */
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
-    const { text, modelId, prompt } = req.body as { text: string; modelId: string; prompt: string };
-    if (!text || !modelId || !prompt) {
+    const { chunks, modelId, prompt, mode } = req.body as {
+      modelId: string;
+      chunks: string[];
+      prompt: string;
+      mode: 'qa' | 'subsection';
+    };
+    if (!chunks || !modelId || !prompt) {
      throw new Error('参数错误');
    }
    await connectToDatabase();
@@ -28,46 +34,31 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
      throw new Error('无权操作该模型');
    }

-    const replaceText = text.replace(/\\n/g, '\n');
+    if (mode === 'qa') {
+      // 批量QA拆分插入数据
+      await SplitData.create({
+        userId,
+        modelId,
+        textList: chunks,
+        prompt
+      });

-    // 文本拆分成 chunk
-    const chunks = replaceText.split('\n').filter((item) => item.trim());
+      generateQA();
+    } else if (mode === 'subsection') {
+      // 插入记录
+      await PgClient.insert('modelData', {
+        values: chunks.map((item) => [
+          { key: 'user_id', value: userId },
+          { key: 'model_id', value: modelId },
+          { key: 'q', value: item },
+          { key: 'a', value: '' },
+          { key: 'status', value: 'waiting' }
+        ])
+      });

-    const textList: string[] = [];
-    let splitText = '';
-
-    /* 取 2.5k ~ 3.5K tokens 内容 */
-    chunks.forEach((chunk) => {
-      const tokens = encode(splitText + chunk).length;
-      if (tokens >= 3500) {
-        // 超过 3500，不要这块内容
-        splitText && textList.push(splitText);
-        splitText = chunk;
-      } else if (tokens >= 2500) {
-        // 超过 3000，取内容
-        splitText && textList.push(splitText + chunk);
-        splitText = '';
-      } else {
-        //没超过 3000，继续添加
-        splitText += chunk;
-      }
-    });
-
-    if (splitText) {
-      textList.push(splitText);
+      generateVector();
    }

-    // 批量插入数据
-    await SplitData.create({
-      userId,
-      modelId,
-      rawText: text,
-      textList,
-      prompt
-    });
-
-    generateQA();
-
    jsonRes(res);
  } catch (err) {
    jsonRes(res, {