添加 markdown-to-text 库以处理 Markdown 文本，更新向量管理器以使用递归字符文本分割器，并优化内容处理逻辑。同时更新本地嵌入模型的默认设置。

This commit is contained in:
duanfuxiang
2025-07-05 05:42:39 +08:00
parent 4e139ecc4f
commit 558e3b3fe4
5 changed files with 106 additions and 43 deletions

View File

@@ -101,6 +101,7 @@
"lodash.isequal": "^4.5.0",
"lru-cache": "^10.1.0",
"lucide-react": "^0.447.0",
"markdown-to-text": "^0.1.1",
"mermaid": "^11.6.0",
"micromatch": "^4.0.5",
"minimatch": "^10.0.1",

21
pnpm-lock.yaml generated
View File

@@ -149,6 +149,9 @@ importers:
lucide-react:
specifier: ^0.447.0
version: 0.447.0(react@18.3.1)
markdown-to-text:
specifier: ^0.1.1
version: 0.1.1
mermaid:
specifier: ^11.6.0
version: 11.6.0
@@ -2741,6 +2744,9 @@ packages:
'@types/babel__traverse@7.20.6':
resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==}
'@types/chai@4.3.20':
resolution: {integrity: sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==}
'@types/codemirror@5.60.8':
resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==}
@@ -2910,6 +2916,9 @@ packages:
resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==}
deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed.
'@types/mocha@8.2.3':
resolution: {integrity: sha512-ekGvFhFgrc2zYQoX4JeZPmVzZxw6Dtllga7iGHzfbYIYkAMUx/sAFP2GdFpLff+vdHXu5fl7WX9AT+TtqYcsyw==}
'@types/ms@2.1.0':
resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==}
@@ -5411,6 +5420,9 @@ packages:
markdown-table@3.0.4:
resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
markdown-to-text@0.1.1:
resolution: {integrity: sha512-co/J5l8mJ2RK9wD/nQRGwO7JxoeyfvVNtOZll016EdAX2qYkwCWMdtYvJO42b41Ho7GFEJMuly9llf0Nj+ReQw==}
marked@15.0.12:
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
engines: {node: '>= 18'}
@@ -9713,6 +9725,8 @@ snapshots:
dependencies:
'@babel/types': 7.26.9
'@types/chai@4.3.20': {}
'@types/codemirror@5.60.8':
dependencies:
'@types/tern': 0.23.9
@@ -9919,6 +9933,8 @@ snapshots:
transitivePeerDependencies:
- supports-color
'@types/mocha@8.2.3': {}
'@types/ms@2.1.0': {}
'@types/node-fetch@2.6.12':
@@ -12926,6 +12942,11 @@ snapshots:
markdown-table@3.0.4: {}
markdown-to-text@0.1.1:
dependencies:
'@types/chai': 4.3.20
'@types/mocha': 8.2.3
marked@15.0.12: {}
matcher@3.0.0:

View File

@@ -1,22 +1,23 @@
import { backOff } from 'exponential-backoff'
import { MarkdownTextSplitter } from 'langchain/text_splitter'
import { minimatch } from 'minimatch'
import { App, Notice, TFile } from 'obsidian'
import pLimit from 'p-limit'
import { backOff } from 'exponential-backoff';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import removeMarkdown from "markdown-to-text";
import { minimatch } from 'minimatch';
import { App, Notice, TFile } from 'obsidian';
import pLimit from 'p-limit';
import { IndexProgress } from '../../../components/chat-view/QueryProgress'
import { IndexProgress } from '../../../components/chat-view/QueryProgress';
import {
LLMAPIKeyInvalidException,
LLMAPIKeyNotSetException,
LLMBaseUrlNotSetException,
LLMRateLimitExceededException,
} from '../../../core/llm/exception'
import { InsertVector, SelectVector } from '../../../database/schema'
import { EmbeddingModel } from '../../../types/embedding'
import { openSettingsModalWithError } from '../../../utils/open-settings-modal'
import { DBManager } from '../../database-manager'
} from '../../../core/llm/exception';
import { InsertVector, SelectVector } from '../../../database/schema';
import { EmbeddingModel } from '../../../types/embedding';
import { openSettingsModalWithError } from '../../../utils/open-settings-modal';
import { DBManager } from '../../database-manager';
import { VectorRepository } from './vector-repository'
import { VectorRepository } from './vector-repository';
export class VectorManager {
private app: App
@@ -111,10 +112,25 @@ export class VectorManager {
return
}
const textSplitter = new MarkdownTextSplitter({
// Embed the files
const overlap = Math.floor(options.chunkSize * 0.15)
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: options.chunkSize,
// chunkOverlap: Math.floor(options.chunkSize * 0.15)
})
chunkOverlap: overlap,
separators: [
"\n\n",
"\n",
".",
",",
" ",
"\u200b", // Zero-width space
"\uff0c", // Fullwidth comma
"\u3001", // Ideographic comma
"\uff0e", // Fullwidth full stop
"\u3002", // Ideographic full stop
"",
],
});
const skippedFiles: string[] = []
const contentChunks: InsertVector[] = (
@@ -127,18 +143,25 @@ export class VectorManager {
const fileDocuments = await textSplitter.createDocuments([
fileContent,
])
return fileDocuments.map((chunk): InsertVector => {
return {
path: file.path,
mtime: file.stat.mtime,
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
return fileDocuments
.map((chunk): InsertVector | null => {
const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
if (!content || content.trim().length === 0) {
console.log("skipped chunk", chunk.pageContent)
return null
}
return {
path: file.path,
mtime: file.stat.mtime,
content,
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
.filter((chunk): chunk is InsertVector => chunk !== null)
} catch (error) {
console.warn(`跳过文件 ${file.path}:`, error.message)
skippedFiles.push(file.path)
@@ -327,18 +350,24 @@ export class VectorManager {
fileContent,
])
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
return {
path: file.path,
mtime: file.stat.mtime,
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
const contentChunks: InsertVector[] = fileDocuments
.map((chunk): InsertVector | null => {
const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
if (!content || content.trim().length === 0) {
return null
}
return {
path: file.path,
mtime: file.stat.mtime,
content,
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
.filter((chunk): chunk is InsertVector => chunk !== null)
// 减少批量大小以降低内存压力
const insertBatchSize = 16 // 从64降低到16

View File

@@ -71,7 +71,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
};
const providers = GetAllProviders(); // 按照重要程度排序
const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序
// const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序
// 获取已设置API Key的提供商列表
const getSettedProviders = (): ApiProvider[] => {
@@ -95,7 +95,8 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
const selectedProvider = providers.find(provider => settedProviders.includes(provider));
// 选择embedding的提供商按embeddingProviders排序选择最靠前的
const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider));
// const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider));
const embeddingProvider = ApiProvider.LocalProvider; // default to local provider
// 准备要更新的设置对象
const newSettings = { ...settings };
@@ -119,6 +120,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
}
}
// TODO: temporary fix — embeddingProvider is currently hard-coded to the local provider; remove this once embedding provider selection is implemented.
if (embeddingProvider) {
const embeddingDefaultModels = GetDefaultModelId(embeddingProvider);

View File

@@ -1638,7 +1638,7 @@ export const grokModels = {
// LocalProvider (本地嵌入模型)
export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete
export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete
export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "Xenova/all-MiniLM-L6-v2"
export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "TaylorAI/bge-micro-v2"
export const localProviderEmbeddingModels = {
'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' },
@@ -1650,7 +1650,17 @@ export const localProviderEmbeddingModels = {
'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' },
'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' },
'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' },
'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' }
'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' },
// 新增的模型
'TaylorAI/bge-micro-v2': { dimensions: 384, description: 'BGE-micro-v2 (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-xs': { dimensions: 384, description: 'Snowflake Arctic Embed XS (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-s': { dimensions: 384, description: 'Snowflake Arctic Embed Small (本地512令牌384维)' },
'Snowflake/snowflake-arctic-embed-m': { dimensions: 768, description: 'Snowflake Arctic Embed Medium (本地512令牌768维)' },
'TaylorAI/gte-tiny': { dimensions: 384, description: 'GTE-tiny (本地512令牌384维)' },
'Mihaiii/Ivysaur': { dimensions: 384, description: 'Ivysaur (本地512令牌384维)' },
'andersonbcdefg/bge-small-4096': { dimensions: 384, description: 'BGE-small-4K (本地4096令牌384维)' },
'nomic-ai/nomic-embed-text-v1.5': { dimensions: 768, description: 'Nomic-embed-text-v1.5 (本地2048令牌768维)' },
'nomic-ai/nomic-embed-text-v1': { dimensions: 768, description: 'Nomic-embed-text (本地2048令牌768维)' }
} as const satisfies Record<string, EmbeddingModelInfo>
/// helper functions