feat: chunk index independent config (#4271)

* sync collection * remove lock * feat: chunk index independent config * feat: add max chunksize to split chunk function * remove log * update doc * remove * remove log
2025-03-21 16:44:25 +08:00
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -1,15 +1,17 @@
+import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
 import { getErrText } from '../error/utils';

 export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';

 type SplitProps = {
  text: string;
-  chunkLen: number;
+  chunkSize: number;
+  maxSize?: number;
  overlapRatio?: number;
  customReg?: string[];
 };
-export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
-  chunkLen?: number;
+export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
+  chunkSize?: number;
 };

 type SplitResponse = {
@@ -55,7 +57,7 @@ const strIsMdTable = (str: string) => {
  return true;
 };
 const markdownTableSplit = (props: SplitProps): SplitResponse => {
-  let { text = '', chunkLen } = props;
+  let { text = '', chunkSize } = props;
  const splitText2Lines = text.split('\n');
  const header = splitText2Lines[0];
  const headerSize = header.split('|').length - 2;
@@ -71,7 +73,7 @@ ${mdSplitString}
 `;

  for (let i = 2; i < splitText2Lines.length; i++) {
-    if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) {
+    if (chunk.length + splitText2Lines[i].length > chunkSize * 1.2) {
      chunks.push(chunk);
      chunk = `${header}
 ${mdSplitString}
@@ -98,11 +100,17 @@ ${mdSplitString}
  5. 标点分割：重叠
 */
 const commonSplit = (props: SplitProps): SplitResponse => {
-  let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
+  let {
+    text = '',
+    chunkSize,
+    maxSize = defaultMaxChunkSize,
+    overlapRatio = 0.15,
+    customReg = []
+  } = props;

  const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
  const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
-  const overlapLen = Math.round(chunkLen * overlapRatio);
+  const overlapLen = Math.round(chunkSize * overlapRatio);

  // replace code block all \n to codeBlockMarker
  text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
@@ -118,24 +126,24 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  const stepReges: { reg: RegExp | string; maxLen: number }[] = [
    ...customReg.map((text) => ({
      reg: text.replaceAll('\\n', '\n'),
-      maxLen: chunkLen * 1.4
+      maxLen: chunkSize
    })),
-    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
-    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
-    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
-    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
-    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
+    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },

-    { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
-    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块，尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
-    { reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
-    { reg: /([\n])/g, maxLen: chunkLen * 1.2 },
+    { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
+    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块，尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
+    { reg: /(\n{2,})/g, maxLen: chunkSize },
+    { reg: /([\n])/g, maxLen: chunkSize },
    // ------ There's no overlap on the top
-    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
-    { reg: /([！]|!\s)/g, maxLen: chunkLen * 1.2 },
-    { reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.4 },
-    { reg: /([；]|;\s)/g, maxLen: chunkLen * 1.6 },
-    { reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
+    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize },
+    { reg: /([！]|!\s)/g, maxLen: chunkSize },
+    { reg: /([？]|\?\s)/g, maxLen: chunkSize },
+    { reg: /([；]|;\s)/g, maxLen: chunkSize },
+    { reg: /([，]|,\s)/g, maxLen: chunkSize }
  ];

  const customRegLen = customReg.length;
@@ -203,7 +211,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
  /* Gets the overlap at the end of a text as the beginning of the next block */
  const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
    const forbidOverlap = checkForbidOverlap(step);
-    const maxOverlapLen = chunkLen * 0.4;
+    const maxOverlapLen = chunkSize * 0.4;

    // step >= stepReges.length: Do not overlap incomplete sentences
    if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
@@ -246,13 +254,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {

    // oversize
    if (step >= stepReges.length) {
-      if (text.length < chunkLen * 3) {
+      if (text.length < chunkSize * 3) {
        return [text];
      }
-      // use slice-chunkLen to split text
+      // use slice-chunkSize to split text
      const chunks: string[] = [];
-      for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
-        chunks.push(text.slice(i, i + chunkLen));
+      for (let i = 0; i < text.length; i += chunkSize - overlapLen) {
+        chunks.push(text.slice(i, i + chunkSize));
      }
      return chunks;
    }
@@ -260,8 +268,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
    // split text by special char
    const splitTexts = getSplitTexts({ text, step });

-    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
-    const minChunkLen = chunkLen * 0.7;
+    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
+    const minChunkLen = chunkSize * 0.7;

    const chunks: string[] = [];
    for (let i = 0; i < splitTexts.length; i++) {
@@ -297,7 +305,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
        continue;
      }

-      // newText is too large(now, The lastText must be smaller than chunkLen)
+      // newText is too large (at this point, lastText must be smaller than chunkSize)
      if (newTextLen > maxLen) {
        // lastText is longer than minChunkLen: push it to chunks directly instead of adding it to the next chunk. (large lastText)
        if (lastTextLen > minChunkLen) {
@@ -352,7 +360,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {

    /* If the last chunk is independent, it needs to be pushed to chunks. */
    if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
-      if (lastText.length < chunkLen * 0.4) {
+      if (lastText.length < chunkSize * 0.4) {
        chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
      } else {
        chunks.push(lastText);
@@ -386,9 +394,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {

 /**
 * text split into chunks
- * chunkLen - one chunk len. max: 3500
+ * chunkSize - one chunk len. max: 3500
 * overlapLen - the length of the text repeated before and after adjacent chunks
- * chunkLen > overlapLen
+ * chunkSize > overlapLen
 * markdown
 */
 export const splitText2Chunks = (props: SplitProps): SplitResponse => {
--- a/packages/global/core/dataset/api.d.ts
+++ b/packages/global/core/dataset/api.d.ts
@@ -1,5 +1,10 @@
 import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
-import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
+import {
+  DatasetCollectionTypeEnum,
+  DatasetCollectionDataProcessModeEnum,
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum
+} from './constants';
 import type { LLMModelItemType } from '../ai/model.d';
 import { ParentIdType } from 'common/parentFolder/type';

@@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = {
  trainingType?: DatasetCollectionDataProcessModeEnum;
  imageIndex?: boolean;
  autoIndexes?: boolean;
+
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+
  chunkSize?: number;
+  indexSize?: number;
+
  chunkSplitter?: string;
  qaPrompt?: string;
  metadata?: Record<string, any>;
--- a/packages/global/core/dataset/constants.ts
+++ b/packages/global/core/dataset/constants.ts
@@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = {
  }
 };

+export enum ChunkSettingModeEnum { // how chunk settings are chosen; read by computeChunkSize and computeChunkSplitter
+  auto = 'auto', // built-in defaults are used and computeChunkSplitter returns no splitter
+  custom = 'custom' // caller-supplied chunkSize / chunkSplitter are taken into account
+}
+
+export enum DataChunkSplitModeEnum { // which criterion drives splitting when settings are not auto
+  size = 'size', // computeChunkSplitter discards any custom splitter in this mode
+  char = 'char' // computeChunkSize returns the LLM max chunk size; presumably splitting relies on chunkSplitter — confirm
+}
+
 /* ------------ data -------------- */

 /* ------------ training -------------- */
--- a/packages/global/core/dataset/controller.d.ts
+++ b/packages/global/core/dataset/controller.d.ts
@@ -13,6 +13,7 @@ export type CreateDatasetDataProps = {

 export type UpdateDatasetDataProps = {
  dataId: string;
+
  q?: string;
  a?: string;
  indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
--- a/packages/global/core/dataset/training/type.d.ts
+++ b/packages/global/core/dataset/training/type.d.ts
@@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = {
  vectorModel: string;
  vlmModel?: string;

+  indexSize?: number;
+
  billId?: string;
  session?: ClientSession;
 };
--- a/packages/global/core/dataset/training/utils.ts
+++ b/packages/global/core/dataset/training/utils.ts
@@ -0,0 +1,136 @@
+import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
+import {
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum,
+  DatasetCollectionDataProcessModeEnum
+} from '../constants';
+
+export const minChunkSize = 64; // minimum index size and chunk size
+
+// Chunk size (non-QA chunk training)
+export const chunkAutoChunkSize = 1500; // computeChunkSize returns this for non-QA auto mode and falls back to it when chunkSize is unset
+export const getMaxChunkSize = (model: LLMModelItemType) => { // model.maxContext minus model.maxResponse, never below 2000
+  return Math.max(model.maxContext - model.maxResponse, 2000);
+};
+
+// QA: default chunk size derived from the LLM's context budget
+export const defaultMaxChunkSize = 8000; // upper bound for the default QA chunk size; also the default maxSize in textSplitter's commonSplit
+export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => { // (maxContext - maxResponse) clamped to [2000, defaultMaxChunkSize]
+  if (!model) return defaultMaxChunkSize; // no model given: use the cap itself
+  return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000);
+};
+
+export const getLLMMaxChunkSize = (model?: LLMModelItemType) => { // used by computeChunkSize as the ceiling for a custom chunkSize
+  if (!model) return 8000; // NOTE(review): same value as defaultMaxChunkSize — consider reusing the constant
+  return Math.max(model.maxContext - model.maxResponse, 2000); // context minus response budget, never below 2000 and with no upper cap
+};
+
+// Index size: values read from the embedding model, with 512 as the fallback
+export const getMaxIndexSize = (model?: EmbeddingModelItemType) => { // model.maxToken, or 512 when the model or the field is missing/falsy
+  return model?.maxToken || 512;
+};
+export const getAutoIndexSize = (model?: EmbeddingModelItemType) => { // model.defaultToken, or 512 when the model or the field is missing/falsy
+  return model?.defaultToken || 512;
+};
+
+const indexSizeSelectList = [ // label/value pairs in ascending order from 64 to 8192; filtered by getIndexSizeSelectList
+  {
+    label: '64',
+    value: 64
+  },
+  {
+    label: '128',
+    value: 128
+  },
+  {
+    label: '256',
+    value: 256
+  },
+  {
+    label: '512',
+    value: 512
+  },
+  {
+    label: '768',
+    value: 768
+  },
+  {
+    label: '1024',
+    value: 1024
+  },
+  {
+    label: '1536',
+    value: 1536
+  },
+  {
+    label: '2048',
+    value: 2048
+  },
+  {
+    label: '3072',
+    value: 3072
+  },
+  {
+    label: '4096',
+    value: 4096
+  },
+  {
+    label: '5120',
+    value: 5120
+  },
+  {
+    label: '6144',
+    value: 6144
+  },
+  {
+    label: '7168',
+    value: 7168
+  },
+  {
+    label: '8192',
+    value: 8192
+  }
+];
+export const getIndexSizeSelectList = (max = 512) => { // returns the entries of indexSizeSelectList whose value is <= max (default 512)
+  return indexSizeSelectList.filter((item) => item.value <= max); // filter() returns a new array; the source list is not mutated
+};
+
+// Compute: resolve the effective chunk size from the training type, setting mode, split mode and LLM
+export const computeChunkSize = (params: {
+  trainingType: DatasetCollectionDataProcessModeEnum;
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+  llmModel?: LLMModelItemType;
+  chunkSize?: number;
+}) => {
+  if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
+      return getLLMDefaultChunkSize(params.llmModel); // QA + auto: size derived from the LLM's context budget
+    }
+  } else {
+    // every training type other than qa (chunk)
+    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
+      return chunkAutoChunkSize; // non-QA + auto: fixed default
+    }
+  }
+
+  if (params.chunkSplitMode === DataChunkSplitModeEnum.char) { // reached for any chunkSettingMode other than auto, including undefined
+    return getLLMMaxChunkSize(params.llmModel); // char mode: params.chunkSize is ignored and the model ceiling is returned
+  }
+
+  return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel)); // missing or 0 chunkSize falls back to chunkAutoChunkSize; result is capped at the model ceiling
+};
+
+export const computeChunkSplitter = (params: { // resolves the custom splitter to use; undefined means none
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+  chunkSplitter?: string;
+}) => {
+  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
+    return undefined; // auto settings never use a custom splitter
+  }
+  if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
+    return undefined; // size mode ignores any supplied chunkSplitter
+  }
+  return params.chunkSplitter; // any other combination (including undefined modes): pass the caller's splitter through, which may itself be undefined
+};
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
 import { PermissionTypeEnum } from '../../support/permission/constant';
 import { PushDatasetDataChunkProps } from './api';
 import {
+  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionTypeEnum,
  DatasetStatusEnum,
@@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller';
 import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
 import { SourceMemberType } from 'support/user/type';
 import { DatasetDataIndexTypeEnum } from './data/constants';
+import { ChunkSettingModeEnum } from './constants';

 export type DatasetSchemaType = {
  _id: string;
@@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = {
  autoIndexes?: boolean;
  imageIndex?: boolean;
  trainingType: DatasetCollectionDataProcessModeEnum;
-  chunkSize: number;
+
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+
+  chunkSize?: number;
+  indexSize?: number;
  chunkSplitter?: string;
  qaPrompt?: string;
 };
--- a/packages/global/core/dataset/utils.ts
+++ b/packages/global/core/dataset/utils.ts
@@ -1,7 +1,6 @@
 import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
 import { getFileIcon } from '../../common/file/icon';
 import { strIsLink } from '../../common/string/tools';
-import { DatasetDataIndexTypeEnum } from './data/constants';

 export function getCollectionIcon(
  type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,26 +37,6 @@ export function getSourceNameIcon({
  return 'file/fill/file';
 }

-/* get dataset data default index */
-export function getDefaultIndex(props?: { q?: string; a?: string }) {
-  const { q = '', a } = props || {};
-
-  return [
-    {
-      text: q,
-      type: DatasetDataIndexTypeEnum.default
-    },
-    ...(a
-      ? [
-          {
-            text: a,
-            type: DatasetDataIndexTypeEnum.default
-          }
-        ]
-      : [])
-  ];
-}
-
 export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
  if (mode === TrainingModeEnum.qa) return data.length * 20;
  if (mode === TrainingModeEnum.auto) return data.length * 5;