perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split * update max size computed * perf: i18n * remove table
2025-05-26 18:57:22 +08:00
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -15,7 +15,13 @@
  "backup_dataset_tip": "You can reimport the downloaded csv file when exporting the knowledge base.",
  "backup_mode": "Backup import",
  "chunk_max_tokens": "max_tokens",
+  "chunk_process_params": "Block processing parameters",
  "chunk_size": "Block size",
+  "chunk_trigger": "Chunking conditions",
+  "chunk_trigger_force_chunk": "Forced chunking",
+  "chunk_trigger_max_size": "The original text length is less than 70% of the maximum context of the file processing model",
+  "chunk_trigger_min_size": "The original text is greater than",
+  "chunk_trigger_tips": "Chunking is triggered when certain conditions are met; otherwise the original text is stored in full without chunking",
  "close_auto_sync": "Are you sure you want to turn off automatic sync?",
  "collection.Create update time": "Creation/Update Time",
  "collection.Training type": "Training",
@@ -29,6 +35,7 @@
  "collection_tags": "Collection Tags",
  "common_dataset": "General Dataset",
  "common_dataset_desc": "Building a knowledge base by importing files, web page links, or manual entry",
+  "condition": "condition",
  "config_sync_schedule": "Configure scheduled synchronization",
  "confirm_to_rebuild_embedding_tip": "Are you sure you want to switch the index for the Dataset?\nSwitching the index is a significant operation that requires re-indexing all data in your Dataset, which may take a long time. Please ensure your account has sufficient remaining points.\n\nAdditionally, you need to update the applications that use this Dataset to avoid conflicts with other indexed model Datasets.",
  "core.dataset.import.Adjust parameters": "Adjust parameters",
@@ -100,6 +107,7 @@
  "is_open_schedule": "Enable scheduled synchronization",
  "keep_image": "Keep the picture",
  "loading": "Loading...",
+  "max_chunk_size": "Maximum chunk size",
  "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
  "noChildren": "No subdirectories",
  "noSelectedFolder": "No selected folder",
@@ -107,8 +115,10 @@
  "noValidId": "No valid ID",
  "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
  "other_dataset": "Third-party knowledge base",
+  "paragraph_max_deep": "Maximum paragraph depth",
+  "paragraph_split": "Partition by paragraph",
+  "paragraph_split_tip": "Priority is given to chunking by Markdown heading paragraphs. If a chunk is too long, it is further split by length.",
  "params_config": "Config",
-  "params_setting": "Parameter settings",
  "pdf_enhance_parse": "PDF enhancement analysis",
  "pdf_enhance_parse_price": "{{price}} points/page",
  "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.",