Use a web worker to run PGlite

This commit is contained in:
duanfuxiang
2025-03-19 21:01:32 +08:00
parent 76288377c3
commit 679d7142eb
25 changed files with 985 additions and 461 deletions

View File

@@ -6,10 +6,10 @@ import pLimit from 'p-limit'
import { IndexProgress } from '../../../components/chat-view/QueryProgress'
import {
LLMAPIKeyInvalidException,
LLMAPIKeyNotSetException,
LLMBaseUrlNotSetException,
LLMRateLimitExceededException,
LLMAPIKeyInvalidException,
LLMAPIKeyNotSetException,
LLMBaseUrlNotSetException,
LLMRateLimitExceededException,
} from '../../../core/llm/exception'
import { InsertVector, SelectVector } from '../../../database/schema'
import { EmbeddingModel } from '../../../types/embedding'
@@ -19,260 +19,353 @@ import { DBManager } from '../../database-manager'
import { VectorRepository } from './vector-repository'
/**
 * Maintains the vector index for the vault's markdown files.
 *
 * Responsibilities visible in this class:
 * - delegate similarity search to the {@link VectorRepository};
 * - (re)build the index for the whole vault, either fully or incrementally;
 * - update or delete the vectors of a single file.
 *
 * All persistence goes through `this.repository`; this class only decides
 * which files need work, splits them into chunks, and requests embeddings.
 */
export class VectorManager {
  /** Maximum number of embedding requests in flight at once. */
  private static readonly EMBEDDING_CONCURRENCY = 50
  /** Maximum number of vectors passed to a single `insertVectors` call. */
  private static readonly INSERT_BATCH_SIZE = 100

  private readonly app: App
  private readonly repository: VectorRepository
  private readonly dbManager: DBManager

  /**
   * @param app - Obsidian app; used for vault access and for the settings modal.
   * @param dbManager - Supplies the Postgres client handed to the repository.
   */
  constructor(app: App, dbManager: DBManager) {
    this.app = app
    this.dbManager = dbManager
    this.repository = new VectorRepository(app, dbManager.getPgClient())
  }

  /**
   * Runs a similarity search by forwarding all arguments to the repository.
   *
   * @param queryVector - Embedding of the query.
   * @param embeddingModel - Model whose vectors should be searched.
   * @param options - Similarity threshold, result limit and optional file/folder scope.
   * @returns Matching rows (without their embedding) plus a `similarity` score.
   */
  async performSimilaritySearch(
    queryVector: number[],
    embeddingModel: EmbeddingModel,
    options: {
      minSimilarity: number
      limit: number
      scope?: {
        files: string[]
        folders: string[]
      }
    },
  ): Promise<
    (Omit<SelectVector, 'embedding'> & {
      similarity: number
    })[]
  > {
    return await this.repository.performSimilaritySearch(
      queryVector,
      embeddingModel,
      options,
    )
  }

  /**
   * Brings the vault index up to date for `embeddingModel`.
   *
   * With `options.reindexAll` every matching file is re-embedded and all
   * existing vectors for the model are cleared first. Otherwise vectors of
   * files that no longer exist are removed, and only new or modified files
   * are re-embedded (their old vectors are deleted before embedding starts).
   *
   * Vectors are inserted only after every chunk has been embedded, so a
   * failed run inserts nothing. Files left without vectors are selected again
   * by the next incremental run because they have no indexed chunks.
   *
   * @param embeddingModel - Model used to embed chunks and to scope repository calls.
   * @param options - Chunk size, include/exclude glob patterns and the full-reindex flag.
   * @param updateProgress - Optional callback, called once with zero completed
   *   chunks and then again after each embedded chunk.
   * @throws Re-throws any error that is not an API-key, base-URL or rate-limit
   *   exception; those are reported to the user instead.
   */
  async updateVaultIndex(
    embeddingModel: EmbeddingModel,
    options: {
      chunkSize: number
      excludePatterns: string[]
      includePatterns: string[]
      reindexAll?: boolean
    },
    updateProgress?: (indexProgress: IndexProgress) => void,
  ): Promise<void> {
    let filesToIndex: TFile[]
    if (options.reindexAll) {
      filesToIndex = await this.getFilesToIndex({
        embeddingModel: embeddingModel,
        excludePatterns: options.excludePatterns,
        includePatterns: options.includePatterns,
        reindexAll: true,
      })
      await this.repository.clearAllVectors(embeddingModel)
    } else {
      await this.cleanVectorsForDeletedFiles(embeddingModel)
      filesToIndex = await this.getFilesToIndex({
        embeddingModel: embeddingModel,
        excludePatterns: options.excludePatterns,
        includePatterns: options.includePatterns,
      })
      // Drop stale vectors of the files that are about to be re-embedded.
      await this.repository.deleteVectorsForMultipleFiles(
        filesToIndex.map((file) => file.path),
        embeddingModel,
      )
    }

    if (filesToIndex.length === 0) {
      return
    }

    const textSplitter = this.createTextSplitter(options.chunkSize)
    const contentChunks: InsertVector[] = (
      await Promise.all(
        filesToIndex.map((file) => this.splitFileIntoChunks(file, textSplitter)),
      )
    ).flat()

    updateProgress?.({
      completedChunks: 0,
      totalChunks: contentChunks.length,
      totalFiles: filesToIndex.length,
    })

    try {
      const embeddedChunks = await this.embedChunks(
        contentChunks,
        embeddingModel,
        (completedChunks) => {
          updateProgress?.({
            completedChunks,
            totalChunks: contentChunks.length,
            totalFiles: filesToIndex.length,
          })
        },
      )
      // All embeddings generated: insert them in batches.
      await this.insertVectorsInBatches(embeddedChunks, embeddingModel)
    } catch (error) {
      if (
        error instanceof LLMAPIKeyNotSetException ||
        error instanceof LLMAPIKeyInvalidException ||
        error instanceof LLMBaseUrlNotSetException
      ) {
        // Configuration problems are surfaced in the settings modal.
        openSettingsModalWithError(this.app, error.message)
      } else if (error instanceof LLMRateLimitExceededException) {
        new Notice(error.message)
      } else {
        console.error('Error embedding chunks:', error)
        throw error
      }
    }
  }

  /**
   * Re-indexes a single file: deletes its existing vectors, then splits,
   * embeds and inserts its current content.
   *
   * Embedding and insertion failures are logged and not re-thrown. Errors
   * from the initial delete or from reading/splitting the file do propagate.
   *
   * NOTE(review): the name is PascalCase, unlike the other methods; it is kept
   * because callers outside this class may depend on it.
   *
   * @param embeddingModel - Model used to embed the chunks.
   * @param chunkSize - Chunk size passed to the markdown text splitter.
   * @param file - File whose vectors should be rebuilt.
   */
  async UpdateFileVectorIndex(
    embeddingModel: EmbeddingModel,
    chunkSize: number,
    file: TFile,
  ): Promise<void> {
    // Delete existing vectors for the file
    await this.repository.deleteVectorsForSingleFile(file.path, embeddingModel)

    const textSplitter = this.createTextSplitter(chunkSize)
    const contentChunks = await this.splitFileIntoChunks(file, textSplitter)

    try {
      const embeddedChunks = await this.embedChunks(contentChunks, embeddingModel)
      await this.insertVectorsInBatches(embeddedChunks, embeddingModel)
    } catch (error) {
      console.error('Error embedding chunks:', error)
    }
  }

  /**
   * Removes every vector stored for `file` under `embeddingModel`.
   *
   * @param embeddingModel - Model whose vectors are affected.
   * @param file - File whose vectors should be deleted.
   */
  async DeleteFileVectorIndex(
    embeddingModel: EmbeddingModel,
    file: TFile,
  ): Promise<void> {
    await this.repository.deleteVectorsForSingleFile(file.path, embeddingModel)
  }

  /** Builds the markdown-aware splitter used to cut file content into chunks. */
  private createTextSplitter(chunkSize: number): RecursiveCharacterTextSplitter {
    return RecursiveCharacterTextSplitter.fromLanguage('markdown', {
      chunkSize,
      // TODO: Use token-based chunking after migrating to WebAssembly-based tiktoken
      // Current token counting method is too slow for practical use
      // lengthFunction: async (text) => {
      //   return await tokenCount(text)
      // },
    })
  }

  /** Reads `file` and turns each split document into a not-yet-embedded vector row. */
  private async splitFileIntoChunks(
    file: TFile,
    textSplitter: RecursiveCharacterTextSplitter,
  ): Promise<InsertVector[]> {
    const fileContent = await this.app.vault.cachedRead(file)
    const fileDocuments = await textSplitter.createDocuments([fileContent])
    return fileDocuments.map((chunk): InsertVector => {
      return {
        path: file.path,
        mtime: file.stat.mtime,
        content: chunk.pageContent,
        // Filled in by embedChunks.
        embedding: [],
        metadata: {
          startLine: Number(chunk.metadata.loc.lines.from),
          endLine: Number(chunk.metadata.loc.lines.to),
        },
      }
    })
  }

  /**
   * Embeds every chunk with bounded concurrency and retry with backoff.
   *
   * Once a chunk exhausts its retries, the abort flag makes every task that has
   * not started yet fail immediately, and the first error is thrown to the caller.
   * Results are collected in completion order, not input order.
   *
   * @param onChunkEmbedded - Called after each success with the running count.
   */
  private async embedChunks(
    contentChunks: InsertVector[],
    embeddingModel: EmbeddingModel,
    onChunkEmbedded?: (completedChunks: number) => void,
  ): Promise<InsertVector[]> {
    const embeddedChunks: InsertVector[] = []
    const limit = pLimit(VectorManager.EMBEDDING_CONCURRENCY)
    const abortController = new AbortController()

    const tasks = contentChunks.map((chunk) =>
      limit(async () => {
        if (abortController.signal.aborted) {
          throw new Error('Operation was aborted')
        }
        try {
          await backOff(
            async () => {
              const embedding = await embeddingModel.getEmbedding(chunk.content)
              embeddedChunks.push({
                path: chunk.path,
                mtime: chunk.mtime,
                content: chunk.content,
                embedding,
                metadata: chunk.metadata,
              })
              onChunkEmbedded?.(embeddedChunks.length)
            },
            {
              numOfAttempts: 5,
              startingDelay: 1000,
              timeMultiple: 1.5,
              jitter: 'full',
            },
          )
        } catch (error) {
          abortController.abort()
          throw error
        }
      }),
    )

    await Promise.all(tasks)
    return embeddedChunks
  }

  /** Inserts `chunks` sequentially in slices of at most INSERT_BATCH_SIZE; no-op when empty. */
  private async insertVectorsInBatches(
    chunks: InsertVector[],
    embeddingModel: EmbeddingModel,
  ): Promise<void> {
    const batchSize = VectorManager.INSERT_BATCH_SIZE
    for (let start = 0; start < chunks.length; start += batchSize) {
      await this.repository.insertVectors(
        chunks.slice(start, start + batchSize),
        embeddingModel,
      )
    }
  }

  /** Deletes, in one repository call, the vectors of indexed paths that no longer exist in the vault. */
  private async cleanVectorsForDeletedFiles(
    embeddingModel: EmbeddingModel,
  ): Promise<void> {
    const indexedFilePaths =
      await this.repository.getAllIndexedFilePaths(embeddingModel)
    const needToDelete = indexedFilePaths.filter(
      (filePath) => !this.app.vault.getAbstractFileByPath(filePath),
    )
    if (needToDelete.length > 0) {
      await this.repository.deleteVectorsForMultipleFiles(
        needToDelete,
        embeddingModel,
      )
    }
  }

  /**
   * Selects the markdown files that need (re)indexing.
   *
   * Files matching any exclude pattern are dropped. When include patterns are
   * given, only files matching at least one of them are kept. With
   * `reindexAll` that filtered list is returned as is. Otherwise a file is
   * kept only if it has no stored vectors and is non-empty, or if its mtime
   * is newer than the mtime stored on its first indexed chunk.
   */
  private async getFilesToIndex({
    embeddingModel,
    excludePatterns,
    includePatterns,
    reindexAll,
  }: {
    embeddingModel: EmbeddingModel
    excludePatterns: string[]
    includePatterns: string[]
    reindexAll?: boolean
  }): Promise<TFile[]> {
    let candidates = this.app.vault.getMarkdownFiles().filter((file) => {
      return !excludePatterns.some((pattern) => minimatch(file.path, pattern))
    })

    if (includePatterns.length > 0) {
      candidates = candidates.filter((file) => {
        return includePatterns.some((pattern) => minimatch(file.path, pattern))
      })
    }

    if (reindexAll) {
      return candidates
    }

    // Check for updated or new files
    const checked = await Promise.all(
      candidates.map(async (file): Promise<TFile | null> => {
        const fileChunks = await this.repository.getVectorsByFilePath(
          file.path,
          embeddingModel,
        )
        if (fileChunks.length === 0) {
          // File is not indexed yet; index it unless it is empty.
          const fileContent = await this.app.vault.cachedRead(file)
          return fileContent.length === 0 ? null : file
        }
        // Indexed already: re-index only when the file changed afterwards.
        const outOfDate = file.stat.mtime > fileChunks[0].mtime
        return outOfDate ? file : null
      }),
    )

    // A type guard (unlike `filter(Boolean)`) narrows the result to TFile[].
    return checked.filter((file): file is TFile => file !== null)
  }
}