mirror of
https://github.com/EthanMarti/infio-copilot.git
synced 2026-05-08 08:00:10 +00:00
添加 markdown-to-text 库以处理 Markdown 文本,更新向量管理器以使用递归字符文本分割器,并优化内容处理逻辑。同时更新本地嵌入模型的默认设置。
This commit is contained in:
@@ -101,6 +101,7 @@
|
|||||||
"lodash.isequal": "^4.5.0",
|
"lodash.isequal": "^4.5.0",
|
||||||
"lru-cache": "^10.1.0",
|
"lru-cache": "^10.1.0",
|
||||||
"lucide-react": "^0.447.0",
|
"lucide-react": "^0.447.0",
|
||||||
|
"markdown-to-text": "^0.1.1",
|
||||||
"mermaid": "^11.6.0",
|
"mermaid": "^11.6.0",
|
||||||
"micromatch": "^4.0.5",
|
"micromatch": "^4.0.5",
|
||||||
"minimatch": "^10.0.1",
|
"minimatch": "^10.0.1",
|
||||||
|
|||||||
21
pnpm-lock.yaml
generated
21
pnpm-lock.yaml
generated
@@ -149,6 +149,9 @@ importers:
|
|||||||
lucide-react:
|
lucide-react:
|
||||||
specifier: ^0.447.0
|
specifier: ^0.447.0
|
||||||
version: 0.447.0(react@18.3.1)
|
version: 0.447.0(react@18.3.1)
|
||||||
|
markdown-to-text:
|
||||||
|
specifier: ^0.1.1
|
||||||
|
version: 0.1.1
|
||||||
mermaid:
|
mermaid:
|
||||||
specifier: ^11.6.0
|
specifier: ^11.6.0
|
||||||
version: 11.6.0
|
version: 11.6.0
|
||||||
@@ -2741,6 +2744,9 @@ packages:
|
|||||||
'@types/babel__traverse@7.20.6':
|
'@types/babel__traverse@7.20.6':
|
||||||
resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==}
|
resolution: {integrity: sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==}
|
||||||
|
|
||||||
|
'@types/chai@4.3.20':
|
||||||
|
resolution: {integrity: sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==}
|
||||||
|
|
||||||
'@types/codemirror@5.60.8':
|
'@types/codemirror@5.60.8':
|
||||||
resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==}
|
resolution: {integrity: sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==}
|
||||||
|
|
||||||
@@ -2910,6 +2916,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==}
|
resolution: {integrity: sha512-AlvLWYer6u4BkO4QzMkHo0t9RkvVIgqggVZmO+5snUiuX2caTKqtdqygX6GeE1VQa/TnXw9WoH0spcmHtG0inQ==}
|
||||||
deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed.
|
deprecated: This is a stub types definition. mermaid provides its own type definitions, so you do not need this installed.
|
||||||
|
|
||||||
|
'@types/mocha@8.2.3':
|
||||||
|
resolution: {integrity: sha512-ekGvFhFgrc2zYQoX4JeZPmVzZxw6Dtllga7iGHzfbYIYkAMUx/sAFP2GdFpLff+vdHXu5fl7WX9AT+TtqYcsyw==}
|
||||||
|
|
||||||
'@types/ms@2.1.0':
|
'@types/ms@2.1.0':
|
||||||
resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==}
|
resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==}
|
||||||
|
|
||||||
@@ -5411,6 +5420,9 @@ packages:
|
|||||||
markdown-table@3.0.4:
|
markdown-table@3.0.4:
|
||||||
resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
|
resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
|
||||||
|
|
||||||
|
markdown-to-text@0.1.1:
|
||||||
|
resolution: {integrity: sha512-co/J5l8mJ2RK9wD/nQRGwO7JxoeyfvVNtOZll016EdAX2qYkwCWMdtYvJO42b41Ho7GFEJMuly9llf0Nj+ReQw==}
|
||||||
|
|
||||||
marked@15.0.12:
|
marked@15.0.12:
|
||||||
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
|
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
|
||||||
engines: {node: '>= 18'}
|
engines: {node: '>= 18'}
|
||||||
@@ -9713,6 +9725,8 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
'@babel/types': 7.26.9
|
'@babel/types': 7.26.9
|
||||||
|
|
||||||
|
'@types/chai@4.3.20': {}
|
||||||
|
|
||||||
'@types/codemirror@5.60.8':
|
'@types/codemirror@5.60.8':
|
||||||
dependencies:
|
dependencies:
|
||||||
'@types/tern': 0.23.9
|
'@types/tern': 0.23.9
|
||||||
@@ -9919,6 +9933,8 @@ snapshots:
|
|||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- supports-color
|
- supports-color
|
||||||
|
|
||||||
|
'@types/mocha@8.2.3': {}
|
||||||
|
|
||||||
'@types/ms@2.1.0': {}
|
'@types/ms@2.1.0': {}
|
||||||
|
|
||||||
'@types/node-fetch@2.6.12':
|
'@types/node-fetch@2.6.12':
|
||||||
@@ -12926,6 +12942,11 @@ snapshots:
|
|||||||
|
|
||||||
markdown-table@3.0.4: {}
|
markdown-table@3.0.4: {}
|
||||||
|
|
||||||
|
markdown-to-text@0.1.1:
|
||||||
|
dependencies:
|
||||||
|
'@types/chai': 4.3.20
|
||||||
|
'@types/mocha': 8.2.3
|
||||||
|
|
||||||
marked@15.0.12: {}
|
marked@15.0.12: {}
|
||||||
|
|
||||||
matcher@3.0.0:
|
matcher@3.0.0:
|
||||||
|
|||||||
@@ -1,22 +1,23 @@
|
|||||||
import { backOff } from 'exponential-backoff'
|
import { backOff } from 'exponential-backoff';
|
||||||
import { MarkdownTextSplitter } from 'langchain/text_splitter'
|
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||||
import { minimatch } from 'minimatch'
|
import removeMarkdown from "markdown-to-text";
|
||||||
import { App, Notice, TFile } from 'obsidian'
|
import { minimatch } from 'minimatch';
|
||||||
import pLimit from 'p-limit'
|
import { App, Notice, TFile } from 'obsidian';
|
||||||
|
import pLimit from 'p-limit';
|
||||||
|
|
||||||
import { IndexProgress } from '../../../components/chat-view/QueryProgress'
|
import { IndexProgress } from '../../../components/chat-view/QueryProgress';
|
||||||
import {
|
import {
|
||||||
LLMAPIKeyInvalidException,
|
LLMAPIKeyInvalidException,
|
||||||
LLMAPIKeyNotSetException,
|
LLMAPIKeyNotSetException,
|
||||||
LLMBaseUrlNotSetException,
|
LLMBaseUrlNotSetException,
|
||||||
LLMRateLimitExceededException,
|
LLMRateLimitExceededException,
|
||||||
} from '../../../core/llm/exception'
|
} from '../../../core/llm/exception';
|
||||||
import { InsertVector, SelectVector } from '../../../database/schema'
|
import { InsertVector, SelectVector } from '../../../database/schema';
|
||||||
import { EmbeddingModel } from '../../../types/embedding'
|
import { EmbeddingModel } from '../../../types/embedding';
|
||||||
import { openSettingsModalWithError } from '../../../utils/open-settings-modal'
|
import { openSettingsModalWithError } from '../../../utils/open-settings-modal';
|
||||||
import { DBManager } from '../../database-manager'
|
import { DBManager } from '../../database-manager';
|
||||||
|
|
||||||
import { VectorRepository } from './vector-repository'
|
import { VectorRepository } from './vector-repository';
|
||||||
|
|
||||||
export class VectorManager {
|
export class VectorManager {
|
||||||
private app: App
|
private app: App
|
||||||
@@ -111,10 +112,25 @@ export class VectorManager {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
const textSplitter = new MarkdownTextSplitter({
|
// Embed the files
|
||||||
|
const overlap = Math.floor(options.chunkSize * 0.15)
|
||||||
|
const textSplitter = new RecursiveCharacterTextSplitter({
|
||||||
chunkSize: options.chunkSize,
|
chunkSize: options.chunkSize,
|
||||||
// chunkOverlap: Math.floor(options.chunkSize * 0.15)
|
chunkOverlap: overlap,
|
||||||
})
|
separators: [
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
".",
|
||||||
|
",",
|
||||||
|
" ",
|
||||||
|
"\u200b", // Zero-width space
|
||||||
|
"\uff0c", // Fullwidth comma
|
||||||
|
"\u3001", // Ideographic comma
|
||||||
|
"\uff0e", // Fullwidth full stop
|
||||||
|
"\u3002", // Ideographic full stop
|
||||||
|
"",
|
||||||
|
],
|
||||||
|
});
|
||||||
|
|
||||||
const skippedFiles: string[] = []
|
const skippedFiles: string[] = []
|
||||||
const contentChunks: InsertVector[] = (
|
const contentChunks: InsertVector[] = (
|
||||||
@@ -127,18 +143,25 @@ export class VectorManager {
|
|||||||
const fileDocuments = await textSplitter.createDocuments([
|
const fileDocuments = await textSplitter.createDocuments([
|
||||||
fileContent,
|
fileContent,
|
||||||
])
|
])
|
||||||
return fileDocuments.map((chunk): InsertVector => {
|
return fileDocuments
|
||||||
return {
|
.map((chunk): InsertVector | null => {
|
||||||
path: file.path,
|
const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
|
||||||
mtime: file.stat.mtime,
|
if (!content || content.trim().length === 0) {
|
||||||
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
|
console.log("skipped chunk", chunk.pageContent)
|
||||||
embedding: [],
|
return null
|
||||||
metadata: {
|
}
|
||||||
startLine: Number(chunk.metadata.loc.lines.from),
|
return {
|
||||||
endLine: Number(chunk.metadata.loc.lines.to),
|
path: file.path,
|
||||||
},
|
mtime: file.stat.mtime,
|
||||||
}
|
content,
|
||||||
})
|
embedding: [],
|
||||||
|
metadata: {
|
||||||
|
startLine: Number(chunk.metadata.loc.lines.from),
|
||||||
|
endLine: Number(chunk.metadata.loc.lines.to),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter((chunk): chunk is InsertVector => chunk !== null)
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn(`跳过文件 ${file.path}:`, error.message)
|
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||||||
skippedFiles.push(file.path)
|
skippedFiles.push(file.path)
|
||||||
@@ -327,18 +350,24 @@ export class VectorManager {
|
|||||||
fileContent,
|
fileContent,
|
||||||
])
|
])
|
||||||
|
|
||||||
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
|
const contentChunks: InsertVector[] = fileDocuments
|
||||||
return {
|
.map((chunk): InsertVector | null => {
|
||||||
path: file.path,
|
const content = removeMarkdown(chunk.pageContent).replace(/\0/g, '')
|
||||||
mtime: file.stat.mtime,
|
if (!content || content.trim().length === 0) {
|
||||||
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
|
return null
|
||||||
embedding: [],
|
}
|
||||||
metadata: {
|
return {
|
||||||
startLine: Number(chunk.metadata.loc.lines.from),
|
path: file.path,
|
||||||
endLine: Number(chunk.metadata.loc.lines.to),
|
mtime: file.stat.mtime,
|
||||||
},
|
content,
|
||||||
}
|
embedding: [],
|
||||||
})
|
metadata: {
|
||||||
|
startLine: Number(chunk.metadata.loc.lines.from),
|
||||||
|
endLine: Number(chunk.metadata.loc.lines.to),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter((chunk): chunk is InsertVector => chunk !== null)
|
||||||
|
|
||||||
// 减少批量大小以降低内存压力
|
// 减少批量大小以降低内存压力
|
||||||
const insertBatchSize = 16 // 从64降低到16
|
const insertBatchSize = 16 // 从64降低到16
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
|
|||||||
};
|
};
|
||||||
|
|
||||||
const providers = GetAllProviders(); // 按照重要程度排序
|
const providers = GetAllProviders(); // 按照重要程度排序
|
||||||
const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序
|
// const embeddingProviders = GetEmbeddingProviders(); // 按照重要程度排序
|
||||||
|
|
||||||
// 获取已设置API Key的提供商列表
|
// 获取已设置API Key的提供商列表
|
||||||
const getSettedProviders = (): ApiProvider[] => {
|
const getSettedProviders = (): ApiProvider[] => {
|
||||||
@@ -95,7 +95,8 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
|
|||||||
const selectedProvider = providers.find(provider => settedProviders.includes(provider));
|
const selectedProvider = providers.find(provider => settedProviders.includes(provider));
|
||||||
|
|
||||||
// 选择embedding的提供商(按embeddingProviders排序选择最靠前的)
|
// 选择embedding的提供商(按embeddingProviders排序选择最靠前的)
|
||||||
const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider));
|
// const embeddingProvider = embeddingProviders.find(provider => settedProviders.includes(provider));
|
||||||
|
const embeddingProvider = ApiProvider.LocalProvider; // default to local provider
|
||||||
|
|
||||||
// 准备要更新的设置对象
|
// 准备要更新的设置对象
|
||||||
const newSettings = { ...settings };
|
const newSettings = { ...settings };
|
||||||
@@ -119,6 +120,7 @@ const CustomProviderSettings: React.FC<CustomProviderSettingsProps> = ({ plugin,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// todo: this is a temporary fix for the embedding provider, we should remove this after the embedding provider is implemented
|
||||||
if (embeddingProvider) {
|
if (embeddingProvider) {
|
||||||
const embeddingDefaultModels = GetDefaultModelId(embeddingProvider);
|
const embeddingDefaultModels = GetDefaultModelId(embeddingProvider);
|
||||||
|
|
||||||
|
|||||||
@@ -1638,7 +1638,7 @@ export const grokModels = {
|
|||||||
// LocalProvider (本地嵌入模型)
|
// LocalProvider (本地嵌入模型)
|
||||||
export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete
|
export const localProviderDefaultModelId = null // this is not supported for chat/autocomplete
|
||||||
export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete
|
export const localProviderDefaultAutoCompleteModelId = null // this is not supported for chat/autocomplete
|
||||||
export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "Xenova/all-MiniLM-L6-v2"
|
export const localProviderDefaultEmbeddingModelId: keyof typeof localProviderEmbeddingModels = "TaylorAI/bge-micro-v2"
|
||||||
|
|
||||||
export const localProviderEmbeddingModels = {
|
export const localProviderEmbeddingModels = {
|
||||||
'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' },
|
'Xenova/all-MiniLM-L6-v2': { dimensions: 384, description: 'All-MiniLM-L6-v2 (推荐,轻量级)' },
|
||||||
@@ -1650,7 +1650,17 @@ export const localProviderEmbeddingModels = {
|
|||||||
'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' },
|
'Xenova/multilingual-e5-base': { dimensions: 768, description: 'E5-base (多语言,更高质量)' },
|
||||||
'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' },
|
'Xenova/gte-small': { dimensions: 384, description: 'GTE-small' },
|
||||||
'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' },
|
'Xenova/e5-small-v2': { dimensions: 384, description: 'E5-small-v2' },
|
||||||
'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' }
|
'Xenova/e5-base-v2': { dimensions: 768, description: 'E5-base-v2 (更高质量)' },
|
||||||
|
// 新增的模型
|
||||||
|
'TaylorAI/bge-micro-v2': { dimensions: 384, description: 'BGE-micro-v2 (本地,512令牌,384维)' },
|
||||||
|
'Snowflake/snowflake-arctic-embed-xs': { dimensions: 384, description: 'Snowflake Arctic Embed XS (本地,512令牌,384维)' },
|
||||||
|
'Snowflake/snowflake-arctic-embed-s': { dimensions: 384, description: 'Snowflake Arctic Embed Small (本地,512令牌,384维)' },
|
||||||
|
'Snowflake/snowflake-arctic-embed-m': { dimensions: 768, description: 'Snowflake Arctic Embed Medium (本地,512令牌,768维)' },
|
||||||
|
'TaylorAI/gte-tiny': { dimensions: 384, description: 'GTE-tiny (本地,512令牌,384维)' },
|
||||||
|
'Mihaiii/Ivysaur': { dimensions: 384, description: 'Ivysaur (本地,512令牌,384维)' },
|
||||||
|
'andersonbcdefg/bge-small-4096': { dimensions: 384, description: 'BGE-small-4K (本地,4096令牌,384维)' },
|
||||||
|
'nomic-ai/nomic-embed-text-v1.5': { dimensions: 768, description: 'Nomic-embed-text-v1.5 (本地,2048令牌,768维)' },
|
||||||
|
'nomic-ai/nomic-embed-text-v1': { dimensions: 768, description: 'Nomic-embed-text (本地,2048令牌,768维)' }
|
||||||
} as const satisfies Record<string, EmbeddingModelInfo>
|
} as const satisfies Record<string, EmbeddingModelInfo>
|
||||||
|
|
||||||
/// helper functions
|
/// helper functions
|
||||||
|
|||||||
Reference in New Issue
Block a user