From 558e3b3fe41ac09429f4bc59b9c20185bb354ef6 Mon Sep 17 00:00:00 2001 From: duanfuxiang Date: Sat, 5 Jul 2025 05:42:39 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20markdown-to-text=20?= =?UTF-8?q?=E5=BA=93=E4=BB=A5=E5=A4=84=E7=90=86=20Markdown=20=E6=96=87?= =?UTF-8?q?=E6=9C=AC=EF=BC=8C=E6=9B=B4=E6=96=B0=E5=90=91=E9=87=8F=E7=AE=A1?= =?UTF-8?q?=E7=90=86=E5=99=A8=E4=BB=A5=E4=BD=BF=E7=94=A8=E9=80=92=E5=BD=92?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E6=96=87=E6=9C=AC=E5=88=86=E5=89=B2=E5=99=A8?= =?UTF-8?q?=EF=BC=8C=E5=B9=B6=E4=BC=98=E5=8C=96=E5=86=85=E5=AE=B9=E5=A4=84?= =?UTF-8?q?=E7=90=86=E9=80=BB=E8=BE=91=E3=80=82=E5=90=8C=E6=97=B6=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E6=9C=AC=E5=9C=B0=E5=B5=8C=E5=85=A5=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E7=9A=84=E9=BB=98=E8=AE=A4=E8=AE=BE=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 1 + pnpm-lock.yaml | 21 ++++ src/database/modules/vector/vector-manager.ts | 107 +++++++++++------- .../components/ModelProviderSettings.tsx | 6 +- src/utils/api.ts | 14 ++- 5 files changed, 106 insertions(+), 43 deletions(-) diff --git a/package.json b/package.json index 45ed880..406d1f2 100644 --- a/package.json +++ b/package.json @@ -101,6 +101,7 @@ "lodash.isequal": "^4.5.0", "lru-cache": "^10.1.0", "lucide-react": "^0.447.0", + "markdown-to-text": "^0.1.1", "mermaid": "^11.6.0", "micromatch": "^4.0.5", "minimatch": "^10.0.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1580a32..c3ff286 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -149,6 +149,9 @@ importers: lucide-react: specifier: ^0.447.0 version: 0.447.0(react@18.3.1) + markdown-to-text: + specifier: ^0.1.1 + version: 0.1.1 mermaid: specifier: ^11.6.0 version: 11.6.0 @@ -2741,6 +2744,9 @@ packages: '@types/babel__traverse@7.20.6': resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==} + '@types/chai@4.3.20': + resolution: {integrity: sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==} + '@types/codemirror@5.60.8': resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==} @@ -2910,6 +2916,9 @@ packages: resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==} deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed. + '@types/mocha@8.2.3': + resolution: {integrity: sha512-ekGvFhFgrc2zYQoX4JeZPmVzZxw6Dtllga7iGHzfbYIYkAMUx/sAFP2GdFpLff+vdHXu5fl7WX9AT+TtqYcsyw==} + '@types/ms@2.1.0': resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==} @@ -5411,6 +5420,9 @@ packages: markdown-table@3.0.4: resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==} + markdown-to-text@0.1.1: + resolution: {integrity: sha512-co/J5l8mJ2RK9wD/nQRGwO7JxoeyfvVNtOZll016EdAX2qYkwCWMdtYvJO42b41Ho7GFEJMuly9llf0Nj+ReQw==} + marked@15.0.12: resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==} engines: {node: '>= 18'} @@ -9713,6 +9725,8 @@ snapshots: dependencies: '@babel/types': 7.26.9 + '@types/chai@4.3.20': {} + '@types/codemirror@5.60.8': dependencies: '@types/tern': 0.23.9 @@ -9919,6 +9933,8 @@ snapshots: transitivePeerDependencies: - supports-color + '@types/mocha@8.2.3': {} + '@types/ms@2.1.0': {} '@types/node-fetch@2.6.12': @@ -12926,6 +12942,11 @@ snapshots: markdown-table@3.0.4: {} + markdown-to-text@0.1.1: + dependencies: + '@types/chai': 4.3.20 + '@types/mocha': 8.2.3 + marked@15.0.12: {} matcher@3.0.0: diff --git a/src/database/modules/vector/vector-manager.ts b/src/database/modules/vector/vector-manager.ts index 1421f3e..151c220 100644 --- a/src/database/modules/vector/vector-manager.ts +++ b/src/database/modules/vector/vector-manager.ts @@ -1,22 +1,23 @@ -import { backOff } from 'exponential-backoff' -import { MarkdownTextSplitter } from 'langchain/text_splitter' -import { minimatch } from 'minimatch' -import { App, Notice, TFile } from 'obsidian' -import pLimit from 'p-limit' +import { backOff } from 'exponential-backoff'; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; +import removeMarkdown from "markdown-to-text"; +import { minimatch } from 'minimatch'; +import { App, Notice, TFile } from 'obsidian'; +import pLimit from 'p-limit'; -import { IndexProgress } from '../../../components/chat-view/QueryProgress' +import { IndexProgress } from '../../../components/chat-view/QueryProgress'; import { LLMAPIKeyInvalidException, LLMAPIKeyNotSetException, LLMBaseUrlNotSetException, LLMRateLimitExceededException, -} from '../../../core/llm/exception' -import { InsertVector, SelectVector } from '../../../database/schema' -import { EmbeddingModel } from '../../../types/embedding' -import { openSettingsModalWithError } from '../../../utils/open-settings-modal' -import { DBManager } from '../../database-manager' +} from '../../../core/llm/exception'; +import { InsertVector, SelectVector } from '../../../database/schema'; +import { EmbeddingModel } from '../../../types/embedding'; +import { openSettingsModalWithError } from '../../../utils/open-settings-modal'; +import { DBManager } from '../../database-manager'; -import { VectorRepository } from './vector-repository' +import { VectorRepository } from './vector-repository'; export class VectorManager { private app: App @@ -111,10 +112,25 @@ export class VectorManager { return } - const textSplitter = new MarkdownTextSplitter({ + // Embed the files + const overlap = Math.floor(options.chunkSize * 0.15) + const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: options.chunkSize, - // chunkOverlap: Math.floor(options.chunkSize * 0.15) - }) + chunkOverlap: overlap, + separators: [ + "\n\n", + "\n", + ".", + ",", + " ", + "\u200b", // Zero-width space + "\uff0c", // Fullwidth comma + "\u3001", // Ideographic comma + "\uff0e", // Fullwidth full stop + "\u3002", // Ideographic full stop + "", + ], + }); const skippedFiles: string[] = [] const contentChunks: InsertVector[] = ( @@ -127,18 +143,25 @@ export class VectorManager { const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) - return fileDocuments.map((chunk): InsertVector => { - return { - path: file.path, - mtime: file.stat.mtime, - content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全 - embedding: [], - metadata: { - startLine: Number(chunk.metadata.loc.lines.from), - endLine: Number(chunk.metadata.loc.lines.to), - }, - } - }) + return fileDocuments + .map((chunk): InsertVector | null => { + const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '') + if (!content || content.trim().length === 0) { + console.log("skipped chunk", chunk.pageContent) + return null + } + return { + path: file.path, + mtime: file.stat.mtime, + content, + embedding: [], + metadata: { + startLine: Number(chunk.metadata.loc.lines.from), + endLine: Number(chunk.metadata.loc.lines.to), + }, + } + }) + .filter((chunk): chunk is InsertVector => chunk !== null) } catch (error) { console.warn(`跳过文件 ${file.path}:`, error.message) skippedFiles.push(file.path) @@ -327,18 +350,24 @@ export class VectorManager { fileContent, ]) - const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => { - return { - path: file.path, - mtime: file.stat.mtime, - content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全 - embedding: [], - metadata: { - startLine: Number(chunk.metadata.loc.lines.from), - endLine: Number(chunk.metadata.loc.lines.to), - }, - } - }) + const contentChunks: InsertVector[] = fileDocuments + .map((chunk): InsertVector | null => { + const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '') + if (!content || content.trim().length === 0) { + return null + } + return { + path: file.path, + mtime: file.stat.mtime, + content, + embedding: [], + metadata: { + startLine: Number(chunk.metadata.loc.lines.from), + endLine: Number(chunk.metadata.loc.lines.to), + }, + } + }) + .filter((chunk): chunk is InsertVector => chunk !== null) // 减少批量大小以降低内存压力 const insertBatchSize = 16 // 从64降低到16 diff --git a/src/settings/components/ModelProviderSettings.tsx b/src/settings/components/ModelProviderSettings.tsx index c9fae3b..e1999ff 100644 --- a/src/settings/components/ModelProviderSettings.tsx +++ b/src/settings/components/ModelProviderSettings.tsx @@ -71,7 +71,7 @@ const CustomProviderSettings: React.FC = ({ plugin, }; const providers = GetAllProviders(); // 按照重要程度排序 - const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序 + // const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序 // 获取已设置API Key的提供商列表 const getSettedProviders = (): ApiProvider[] => { @@ -95,7 +95,8 @@ const CustomProviderSettings: React.FC = ({ plugin, const selectedProvider = providers.find(provider => settedProviders.includes(provider)); // 选择embedding的提供商(按embeddingProviders排序选择最靠前的) - const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider)); + // const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider)); + const embeddingProvider = ApiProvider.LocalProvider; // default to local provider // 准备要更新的设置对象 const newSettings = { ...settings }; @@ -119,6 +120,7 @@ const CustomProviderSettings: React.FC = ({ plugin, } } + // todo: this is a temporary fix for the embedding provider, we should remove this after the embedding provider is implemented if (embeddingProvider) { const embeddingDefaultModels = GetDefaultModelId(embeddingProvider); diff --git a/src/utils/api.ts b/src/utils/api.ts index d8a5c72..d25c0de 100644 --- a/src/utils/api.ts +++ b/src/utils/api.ts @@ -1638,7 +1638,7 @@ export const grokModels = { // LocalProvider (本地嵌入模型) export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete -export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "Xenova/all-MiniLM-L6-v2" +export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "TaylorAI/bge-micro-v2" export const localProviderEmbeddingModels = { 'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' }, @@ -1650,7 +1650,17 @@ export const localProviderEmbeddingModels = { 'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' }, 'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' }, 'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' }, - 'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' } + 'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' }, + // 新增的模型 + 'TaylorAI/bge-micro-v2': { dimensions: 384, description: 'BGE-micro-v2 (本地,512令牌,384维)' }, + 'Snowflake/snowflake-arctic-embed-xs': { dimensions: 384, description: 'Snowflake Arctic Embed XS (本地,512令牌,384维)' }, + 'Snowflake/snowflake-arctic-embed-s': { dimensions: 384, description: 'Snowflake Arctic Embed Small (本地,512令牌,384维)' }, + 'Snowflake/snowflake-arctic-embed-m': { dimensions: 768, description: 'Snowflake Arctic Embed Medium (本地,512令牌,768维)' }, + 'TaylorAI/gte-tiny': { dimensions: 384, description: 'GTE-tiny (本地,512令牌,384维)' }, + 'Mihaiii/Ivysaur': { dimensions: 384, description: 'Ivysaur (本地,512令牌,384维)' }, + 'andersonbcdefg/bge-small-4096': { dimensions: 384, description: 'BGE-small-4K (本地,4096令牌,384维)' }, + 'nomic-ai/nomic-embed-text-v1.5': { dimensions: 768, description: 'Nomic-embed-text-v1.5 (本地,2048令牌,768维)' }, + 'nomic-ai/nomic-embed-text-v1': { dimensions: 768, description: 'Nomic-embed-text (本地,2048令牌,768维)' } } as const satisfies Record /// helper functions