feat: 拆分文本增加滑块，增加直接分段导入方式

2023-04-23 22:36:04 +08:00
parent 2774940851
commit e0b1a78344
15 changed files with 317 additions and 155 deletions
--- a/src/utils/file.ts
+++ b/src/utils/file.ts
@@ -1,5 +1,6 @@
 import mammoth from 'mammoth';
 import Papa from 'papaparse';
+import { encode } from 'gpt-token-utils';

 /**
 * 读取 txt 文件内容
@@ -137,3 +138,54 @@ export const fileDownload = ({
  downloadLink.click();
  document.body.removeChild(downloadLink);
 };
+
+/**
+ * Split text into overlapping chunks whose sizes are measured in GPT tokens.
+ * Pieces are runs of sentence punctuation/newlines or runs of non-whitespace
+ * characters; they are re-joined with '' so whitespace between pieces is dropped.
+ * A chunk is closed before a piece that would push it past maxLen; a single piece
+ * larger than maxLen still ends up in an oversized chunk.
+ * @param text - raw text to split
+ * @param maxLen - token budget of one chunk (max: 3500); expected to be > slideLen
+ * @param slideLen - minimum token length of the previous tail repeated in the next chunk
+ * @returns the chunk strings; [''] when text contains no usable piece
+ */
+export const splitText = ({
+  text,
+  maxLen,
+  slideLen
+}: {
+  text: string;
+  maxLen: number;
+  slideLen: number;
+}) => {
+  type Chunk = { sum: number; fresh: number; arr: string[] };
+  // literal "\n" sequences count as newlines; whitespace-only pieces are dropped
+  const pieces: string[] = text.match(/[！？。\n.]+|[^\s]+/g) ?? [];
+  const textArr = pieces.filter((item) => item.replace(/(\\n)/g, '\n').trim() !== '');
+
+  // fresh = pieces added after the overlap seed; a chunk holding only overlap is never closed
+  const chunks: Chunk[] = [{ sum: 0, fresh: 0, arr: [] }];
+
+  for (const piece of textArr) {
+    const tokenLen = encode(piece).length;
+    const last = chunks[chunks.length - 1];
+
+    // this piece would overflow the current chunk: open a new one seeded with its tail
+    if (last.fresh > 0 && last.sum + tokenLen > maxLen) {
+      const seed: Chunk = { sum: 0, fresh: 0, arr: [] };
+      for (let j = last.arr.length - 1; j >= 0 && seed.sum < slideLen; j--) {
+        seed.sum += encode(last.arr[j]).length;
+        seed.arr.unshift(last.arr[j]);
+      }
+      chunks.push(seed);
+    }
+
+    const current = chunks[chunks.length - 1];
+    current.sum += tokenLen;
+    current.fresh += 1;
+    current.arr.push(piece);
+  }
+
+  return chunks.map((item) => item.arr.join(''));
+};