添加 markdown-to-text 库以处理 Markdown 文本,更新向量管理器以使用递归字符文本分割器,并优化内容处理逻辑。同时更新本地嵌入模型的默认设置。

This commit is contained in:
duanfuxiang
2025-07-05 05:42:39 +08:00
parent 4e139ecc4f
commit 558e3b3fe4
5 changed files with 106 additions and 43 deletions

View File

@@ -101,6 +101,7 @@
"lodash.isequal": "^4.5.0", "lodash.isequal": "^4.5.0",
"lru-cache": "^10.1.0", "lru-cache": "^10.1.0",
"lucide-react": "^0.447.0", "lucide-react": "^0.447.0",
"markdown-to-text": "^0.1.1",
"mermaid": "^11.6.0", "mermaid": "^11.6.0",
"micromatch": "^4.0.5", "micromatch": "^4.0.5",
"minimatch": "^10.0.1", "minimatch": "^10.0.1",

21
pnpm-lock.yaml generated
View File

@@ -149,6 +149,9 @@ importers:
lucide-react: lucide-react:
specifier: ^0.447.0 specifier: ^0.447.0
version: 0.447.0(react@18.3.1) version: 0.447.0(react@18.3.1)
markdown-to-text:
specifier: ^0.1.1
version: 0.1.1
mermaid: mermaid:
specifier: ^11.6.0 specifier: ^11.6.0
version: 11.6.0 version: 11.6.0
@@ -2741,6 +2744,9 @@ packages:
'@types/babel__traverse@7.20.6': '@types/babel__traverse@7.20.6':
resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==} resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==}
'@types/chai@4.3.20':
resolution: {integrity: sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==}
'@types/codemirror@5.60.8': '@types/codemirror@5.60.8':
resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==} resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==}
@@ -2910,6 +2916,9 @@ packages:
resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==} resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==}
deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed. deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed.
'@types/mocha@8.2.3':
resolution: {integrity: sha512-ekGvFhFgrc2zYQoX4JeZPmVzZxw6Dtllga7iGHzfbYIYkAMUx/sAFP2GdFpLff+vdHXu5fl7WX9AT+TtqYcsyw==}
'@types/ms@2.1.0': '@types/ms@2.1.0':
resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==} resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==}
@@ -5411,6 +5420,9 @@ packages:
markdown-table@3.0.4: markdown-table@3.0.4:
resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==} resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
markdown-to-text@0.1.1:
resolution: {integrity: sha512-co/J5l8mJ2RK9wD/nQRGwO7JxoeyfvVNtOZll016EdAX2qYkwCWMdtYvJO42b41Ho7GFEJMuly9llf0Nj+ReQw==}
marked@15.0.12: marked@15.0.12:
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==} resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
engines: {node: '>= 18'} engines: {node: '>= 18'}
@@ -9713,6 +9725,8 @@ snapshots:
dependencies: dependencies:
'@babel/types': 7.26.9 '@babel/types': 7.26.9
'@types/chai@4.3.20': {}
'@types/codemirror@5.60.8': '@types/codemirror@5.60.8':
dependencies: dependencies:
'@types/tern': 0.23.9 '@types/tern': 0.23.9
@@ -9919,6 +9933,8 @@ snapshots:
transitivePeerDependencies: transitivePeerDependencies:
- supports-color - supports-color
'@types/mocha@8.2.3': {}
'@types/ms@2.1.0': {} '@types/ms@2.1.0': {}
'@types/node-fetch@2.6.12': '@types/node-fetch@2.6.12':
@@ -12926,6 +12942,11 @@ snapshots:
markdown-table@3.0.4: {} markdown-table@3.0.4: {}
markdown-to-text@0.1.1:
dependencies:
'@types/chai': 4.3.20
'@types/mocha': 8.2.3
marked@15.0.12: {} marked@15.0.12: {}
matcher@3.0.0: matcher@3.0.0:

View File

@@ -1,22 +1,23 @@
import { backOff } from 'exponential-backoff' import { backOff } from 'exponential-backoff';
import { MarkdownTextSplitter } from 'langchain/text_splitter' import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { minimatch } from 'minimatch' import removeMarkdown from "markdown-to-text";
import { App, Notice, TFile } from 'obsidian' import { minimatch } from 'minimatch';
import pLimit from 'p-limit' import { App, Notice, TFile } from 'obsidian';
import pLimit from 'p-limit';
import { IndexProgress } from '../../../components/chat-view/QueryProgress' import { IndexProgress } from '../../../components/chat-view/QueryProgress';
import { import {
LLMAPIKeyInvalidException, LLMAPIKeyInvalidException,
LLMAPIKeyNotSetException, LLMAPIKeyNotSetException,
LLMBaseUrlNotSetException, LLMBaseUrlNotSetException,
LLMRateLimitExceededException, LLMRateLimitExceededException,
} from '../../../core/llm/exception' } from '../../../core/llm/exception';
import { InsertVector, SelectVector } from '../../../database/schema' import { InsertVector, SelectVector } from '../../../database/schema';
import { EmbeddingModel } from '../../../types/embedding' import { EmbeddingModel } from '../../../types/embedding';
import { openSettingsModalWithError } from '../../../utils/open-settings-modal' import { openSettingsModalWithError } from '../../../utils/open-settings-modal';
import { DBManager } from '../../database-manager' import { DBManager } from '../../database-manager';
import { VectorRepository } from './vector-repository' import { VectorRepository } from './vector-repository';
export class VectorManager { export class VectorManager {
private app: App private app: App
@@ -111,10 +112,25 @@ export class VectorManager {
return return
} }
const textSplitter = new MarkdownTextSplitter({ // Embed the files
const overlap = Math.floor(options.chunkSize * 0.15)
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: options.chunkSize, chunkSize: options.chunkSize,
// chunkOverlap: Math.floor(options.chunkSize * 0.15) chunkOverlap: overlap,
}) separators: [
"\n\n",
"\n",
".",
",",
" ",
"\u200b", // Zero-width space
"\uff0c", // Fullwidth comma
"\u3001", // Ideographic comma
"\uff0e", // Fullwidth full stop
"\u3002", // Ideographic full stop
"",
],
});
const skippedFiles: string[] = [] const skippedFiles: string[] = []
const contentChunks: InsertVector[] = ( const contentChunks: InsertVector[] = (
@@ -127,18 +143,25 @@ export class VectorManager {
const fileDocuments = await textSplitter.createDocuments([ const fileDocuments = await textSplitter.createDocuments([
fileContent, fileContent,
]) ])
return fileDocuments.map((chunk): InsertVector => { return fileDocuments
return { .map((chunk): InsertVector | null => {
path: file.path, const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
mtime: file.stat.mtime, if (!content || content.trim().length === 0) {
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全 console.log("skipped chunk", chunk.pageContent)
embedding: [], return null
metadata: { }
startLine: Number(chunk.metadata.loc.lines.from), return {
endLine: Number(chunk.metadata.loc.lines.to), path: file.path,
}, mtime: file.stat.mtime,
} content,
}) embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
.filter((chunk): chunk is InsertVector => chunk !== null)
} catch (error) { } catch (error) {
console.warn(`跳过文件 ${file.path}:`, error.message) console.warn(`跳过文件 ${file.path}:`, error.message)
skippedFiles.push(file.path) skippedFiles.push(file.path)
@@ -327,18 +350,24 @@ export class VectorManager {
fileContent, fileContent,
]) ])
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => { const contentChunks: InsertVector[] = fileDocuments
return { .map((chunk): InsertVector | null => {
path: file.path, const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
mtime: file.stat.mtime, if (!content || content.trim().length === 0) {
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全 return null
embedding: [], }
metadata: { return {
startLine: Number(chunk.metadata.loc.lines.from), path: file.path,
endLine: Number(chunk.metadata.loc.lines.to), mtime: file.stat.mtime,
}, content,
} embedding: [],
}) metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
.filter((chunk): chunk is InsertVector => chunk !== null)
// 减少批量大小以降低内存压力 // 减少批量大小以降低内存压力
const insertBatchSize = 16 // 从64降低到16 const insertBatchSize = 16 // 从64降低到16

View File

@@ -71,7 +71,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
}; };
const providers = GetAllProviders(); // 按照重要程度排序 const providers = GetAllProviders(); // 按照重要程度排序
const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序 // const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序
// 获取已设置API Key的提供商列表 // 获取已设置API Key的提供商列表
const getSettedProviders = (): ApiProvider[] => { const getSettedProviders = (): ApiProvider[] => {
@@ -95,7 +95,8 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
const selectedProvider = providers.find(provider => settedProviders.includes(provider)); const selectedProvider = providers.find(provider => settedProviders.includes(provider));
// 选择embedding的提供商按embeddingProviders排序选择最靠前的 // 选择embedding的提供商按embeddingProviders排序选择最靠前的
const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider)); // const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider));
const embeddingProvider = ApiProvider.LocalProvider; // default to local provider
// 准备要更新的设置对象 // 准备要更新的设置对象
const newSettings = { ...settings }; const newSettings = { ...settings };
@@ -119,6 +120,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
} }
} }
// TODO: This is a temporary fix for the embedding provider; remove it once the embedding provider is implemented.
if (embeddingProvider) { if (embeddingProvider) {
const embeddingDefaultModels = GetDefaultModelId(embeddingProvider); const embeddingDefaultModels = GetDefaultModelId(embeddingProvider);

View File

@@ -1638,7 +1638,7 @@ export const grokModels = {
// LocalProvider (本地嵌入模型) // LocalProvider (本地嵌入模型)
export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete
export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete
export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "Xenova/all-MiniLM-L6-v2" export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "TaylorAI/bge-micro-v2"
export const localProviderEmbeddingModels = { export const localProviderEmbeddingModels = {
'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' }, 'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' },
@@ -1650,7 +1650,17 @@ export const localProviderEmbeddingModels = {
'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' }, 'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' },
'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' }, 'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' },
'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' }, 'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' },
'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' } 'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' },
// 新增的模型
'TaylorAI/bge-micro-v2': { dimensions: 384, description: 'BGE-micro-v2 (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-xs': { dimensions: 384, description: 'Snowflake Arctic Embed XS (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-s': { dimensions: 384, description: 'Snowflake Arctic Embed Small (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-m': { dimensions: 768, description: 'Snowflake Arctic Embed Medium (本地512令牌768维)' },
'TaylorAI/gte-tiny': { dimensions: 384, description: 'GTE-tiny (本地512令牌384维)' },
'Mihaiii/Ivysaur': { dimensions: 384, description: 'Ivysaur (本地512令牌384维)' },
'andersonbcdefg/bge-small-4096': { dimensions: 384, description: 'BGE-small-4K (本地4096令牌384维)' },
'nomic-ai/nomic-embed-text-v1.5': { dimensions: 768, description: 'Nomic-embed-text-v1.5 (本地2048令牌768维)' },
'nomic-ai/nomic-embed-text-v1': { dimensions: 768, description: 'Nomic-embed-text (本地2048令牌768维)' }
} as const satisfies Record<string, EmbeddingModelInfo> } as const satisfies Record<string, EmbeddingModelInfo>
/// helper functions /// helper functions