mirror of https://github.com/EthanMarti/infio-copilot.git
synced 2026-05-08 08:00:10 +00:00
use web worker to run pglite
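The diff below only covers the VectorManager side of the change. For context, this is roughly what running PGlite behind a Web Worker looks like with the worker helpers shipped in @electric-sql/pglite; the file name, database path, and the way dbManager.getPgClient() would hand this client to VectorRepository are illustrative assumptions, not necessarily what this commit does.

// pglite.worker.ts: hypothetical worker entry point (the real file name may differ)
import { PGlite } from '@electric-sql/pglite'
import { worker } from '@electric-sql/pglite/worker'

worker({
  async init() {
    // Persistence target is an assumption; the plugin may store its data elsewhere.
    return new PGlite('idb://infio-vector-db')
  },
})

// main thread: a PGliteWorker proxy that could serve as the pg client
import { PGliteWorker } from '@electric-sql/pglite/worker'

const pgClient = new PGliteWorker(
  new Worker(new URL('./pglite.worker.ts', import.meta.url), { type: 'module' }),
)

Queries issued through pgClient are forwarded to the single PGlite instance owned by the worker, which keeps the Postgres-in-WASM work off the UI thread.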
@@ -6,10 +6,10 @@ import pLimit from 'p-limit'
import { IndexProgress } from '../../../components/chat-view/QueryProgress'
import {
  LLMAPIKeyInvalidException,
  LLMAPIKeyNotSetException,
  LLMBaseUrlNotSetException,
  LLMRateLimitExceededException,
} from '../../../core/llm/exception'
import { InsertVector, SelectVector } from '../../../database/schema'
import { EmbeddingModel } from '../../../types/embedding'
@@ -19,260 +19,353 @@ import { DBManager } from '../../database-manager'
import { VectorRepository } from './vector-repository'

export class VectorManager {
  private app: App
  private repository: VectorRepository
  private dbManager: DBManager

  constructor(app: App, dbManager: DBManager) {
    this.app = app
    this.dbManager = dbManager
    this.repository = new VectorRepository(app, dbManager.getPgClient())
  }

  async performSimilaritySearch(
    queryVector: number[],
    embeddingModel: EmbeddingModel,
    options: {
      minSimilarity: number
      limit: number
      scope?: {
        files: string[]
        folders: string[]
      }
    },
  ): Promise<
    (Omit<SelectVector, 'embedding'> & {
      similarity: number
    })[]
  > {
    return await this.repository.performSimilaritySearch(
      queryVector,
      embeddingModel,
      options,
    )
  }
  async updateVaultIndex(
    embeddingModel: EmbeddingModel,
    options: {
      chunkSize: number
      excludePatterns: string[]
      includePatterns: string[]
      reindexAll?: boolean
    },
    updateProgress?: (indexProgress: IndexProgress) => void,
  ): Promise<void> {
    let filesToIndex: TFile[]
    if (options.reindexAll) {
      filesToIndex = await this.getFilesToIndex({
        embeddingModel: embeddingModel,
        excludePatterns: options.excludePatterns,
        includePatterns: options.includePatterns,
        reindexAll: true,
      })
      await this.repository.clearAllVectors(embeddingModel)
    } else {
-     await this.deleteVectorsForDeletedFiles(embeddingModel)
+     await this.cleanVectorsForDeletedFiles(embeddingModel)
      filesToIndex = await this.getFilesToIndex({
        embeddingModel: embeddingModel,
        excludePatterns: options.excludePatterns,
        includePatterns: options.includePatterns,
      })
      await this.repository.deleteVectorsForMultipleFiles(
        filesToIndex.map((file) => file.path),
        embeddingModel,
      )
    }

    if (filesToIndex.length === 0) {
      return
    }

    const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
      'markdown',
      {
        chunkSize: options.chunkSize,
        // TODO: Use token-based chunking after migrating to WebAssembly-based tiktoken
        // Current token counting method is too slow for practical use
        // lengthFunction: async (text) => {
        //   return await tokenCount(text)
        // },
      },
    )

    const contentChunks: InsertVector[] = (
      await Promise.all(
        filesToIndex.map(async (file) => {
          const fileContent = await this.app.vault.cachedRead(file)
          const fileDocuments = await textSplitter.createDocuments([
            fileContent,
          ])
          return fileDocuments.map((chunk): InsertVector => {
            return {
              path: file.path,
              mtime: file.stat.mtime,
              content: chunk.pageContent,
              embedding: [],
              metadata: {
-               startLine: chunk.metadata.loc.lines.from as number,
-               endLine: chunk.metadata.loc.lines.to as number,
+               startLine: Number(chunk.metadata.loc.lines.from),
+               endLine: Number(chunk.metadata.loc.lines.to),
              },
            }
          })
        }),
      )
    ).flat()

    updateProgress?.({
      completedChunks: 0,
      totalChunks: contentChunks.length,
      totalFiles: filesToIndex.length,
    })

-   const embeddingProgress = { completed: 0, inserted: 0 }
+   const embeddingProgress = { completed: 0 }
    const embeddingChunks: InsertVector[] = []
    const batchSize = 100
    const limit = pLimit(50)
    const abortController = new AbortController()
    const tasks = contentChunks.map((chunk) =>
      limit(async () => {
        if (abortController.signal.aborted) {
          throw new Error('Operation was aborted')
        }
        try {
          await backOff(
            async () => {
              const embedding = await embeddingModel.getEmbedding(chunk.content)
              const embeddedChunk = {
                path: chunk.path,
                mtime: chunk.mtime,
                content: chunk.content,
                embedding,
                metadata: chunk.metadata,
              }
              embeddingChunks.push(embeddedChunk)
              embeddingProgress.completed++
              updateProgress?.({
                completedChunks: embeddingProgress.completed,
                totalChunks: contentChunks.length,
                totalFiles: filesToIndex.length,
              })
-
-             // Insert vectors in batches
-             if (
-               embeddingChunks.length >=
-                 embeddingProgress.inserted + batchSize ||
-               embeddingChunks.length === contentChunks.length
-             ) {
-               await this.repository.insertVectors(
-                 embeddingChunks.slice(
-                   embeddingProgress.inserted,
-                   embeddingProgress.inserted + batchSize,
-                 ),
-                 embeddingModel,
-               )
-               embeddingProgress.inserted += batchSize
-             }
            },
            {
              numOfAttempts: 5,
              startingDelay: 1000,
              timeMultiple: 1.5,
              jitter: 'full',
            },
          )
        } catch (error) {
          abortController.abort()
          throw error
        }
      }),
    )

    try {
      await Promise.all(tasks)
+
+     // all embedding generated, batch insert
+     if (embeddingChunks.length > 0) {
+       // batch insert all vectors
+       let inserted = 0
+       while (inserted < embeddingChunks.length) {
+         const chunksToInsert = embeddingChunks.slice(
+           inserted,
+           Math.min(inserted + batchSize, embeddingChunks.length)
+         )
+         await this.repository.insertVectors(chunksToInsert, embeddingModel)
+         inserted += chunksToInsert.length
+       }
+     }
    } catch (error) {
      if (
        error instanceof LLMAPIKeyNotSetException ||
        error instanceof LLMAPIKeyInvalidException ||
        error instanceof LLMBaseUrlNotSetException
      ) {
-       openSettingsModalWithError(this.app, (error as Error).message)
+       openSettingsModalWithError(this.app, error.message)
      } else if (error instanceof LLMRateLimitExceededException) {
        new Notice(error.message)
      } else {
        console.error('Error embedding chunks:', error)
        throw error
      }
-   } finally {
-     await this.dbManager.save()
    }
  }

-  private async deleteVectorsForDeletedFiles(embeddingModel: EmbeddingModel) {
-    const indexedFilePaths =
-      await this.repository.getIndexedFilePaths(embeddingModel)
-    for (const filePath of indexedFilePaths) {
-      if (!this.app.vault.getAbstractFileByPath(filePath)) {
-        await this.repository.deleteVectorsForMultipleFiles(
-          [filePath],
-          embeddingModel,
-        )
-      }
-    }
-  }
+  async UpdateFileVectorIndex(
+    embeddingModel: EmbeddingModel,
+    chunkSize: number,
+    file: TFile
+  ) {
+    // Delete existing vectors for the files
+    await this.repository.deleteVectorsForSingleFile(
+      file.path,
+      embeddingModel,
+    )
+
+    // Embed the files
+    const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+      'markdown',
+      {
+        chunkSize,
+      },
+    )
+    const fileContent = await this.app.vault.cachedRead(file)
+    const fileDocuments = await textSplitter.createDocuments([
+      fileContent,
+    ])
+
+    const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
+      return {
+        path: file.path,
+        mtime: file.stat.mtime,
+        content: chunk.pageContent,
+        embedding: [],
+        metadata: {
+          startLine: Number(chunk.metadata.loc.lines.from),
+          endLine: Number(chunk.metadata.loc.lines.to),
+        },
+      }
+    })
+
+    const embeddingChunks: InsertVector[] = []
+    const limit = pLimit(50)
+    const abortController = new AbortController()
+    const tasks = contentChunks.map((chunk) =>
+      limit(async () => {
+        if (abortController.signal.aborted) {
+          throw new Error('Operation was aborted')
+        }
+        try {
+          await backOff(
+            async () => {
+              const embedding = await embeddingModel.getEmbedding(chunk.content)
+              const embeddedChunk = {
+                path: chunk.path,
+                mtime: chunk.mtime,
+                content: chunk.content,
+                embedding,
+                metadata: chunk.metadata,
+              }
+              embeddingChunks.push(embeddedChunk)
+            },
+            {
+              numOfAttempts: 5,
+              startingDelay: 1000,
+              timeMultiple: 1.5,
+              jitter: 'full',
+            },
+          )
+        } catch (error) {
+          abortController.abort()
+          throw error
+        }
+      }),
+    )
+
+    try {
+      await Promise.all(tasks)
+      // all embedding generated, batch insert
+      if (embeddingChunks.length > 0) {
+        const batchSize = 100
+        let inserted = 0
+        while (inserted < embeddingChunks.length) {
+          const chunksToInsert = embeddingChunks.slice(inserted, Math.min(inserted + batchSize, embeddingChunks.length))
+          await this.repository.insertVectors(chunksToInsert, embeddingModel)
+          inserted += chunksToInsert.length
+        }
+      }
+    } catch (error) {
+      console.error('Error embedding chunks:', error)
+    }
+  }
+
+  async DeleteFileVectorIndex(
+    embeddingModel: EmbeddingModel,
+    file: TFile
+  ) {
+    await this.repository.deleteVectorsForSingleFile(file.path, embeddingModel)
+  }
+
+  private async cleanVectorsForDeletedFiles(
+    embeddingModel: EmbeddingModel,
+  ) {
+    const indexedFilePaths = await this.repository.getAllIndexedFilePaths(embeddingModel)
+    const needToDelete = indexedFilePaths.filter(filePath => !this.app.vault.getAbstractFileByPath(filePath))
+    if (needToDelete.length > 0) {
+      await this.repository.deleteVectorsForMultipleFiles(
+        needToDelete,
+        embeddingModel,
+      )
+    }
+  }

  private async getFilesToIndex({
    embeddingModel,
    excludePatterns,
    includePatterns,
    reindexAll,
  }: {
    embeddingModel: EmbeddingModel
    excludePatterns: string[]
    includePatterns: string[]
    reindexAll?: boolean
  }): Promise<TFile[]> {
    let filesToIndex = this.app.vault.getMarkdownFiles()

    filesToIndex = filesToIndex.filter((file) => {
      return !excludePatterns.some((pattern) => minimatch(file.path, pattern))
    })

    if (includePatterns.length > 0) {
      filesToIndex = filesToIndex.filter((file) => {
        return includePatterns.some((pattern) => minimatch(file.path, pattern))
      })
    }

    if (reindexAll) {
      return filesToIndex
    }

    // Check for updated or new files
    filesToIndex = await Promise.all(
      filesToIndex.map(async (file) => {
        const fileChunks = await this.repository.getVectorsByFilePath(
          file.path,
          embeddingModel,
        )
        if (fileChunks.length === 0) {
          // File is not indexed, so we need to index it
          const fileContent = await this.app.vault.cachedRead(file)
          if (fileContent.length === 0) {
            // Ignore empty files
            return null
          }
          return file
        }
        const outOfDate = file.stat.mtime > fileChunks[0].mtime
        if (outOfDate) {
          // File has changed, so we need to re-index it
          return file
        }
        return null
      }),
    ).then((files) => files.filter(Boolean))

    return filesToIndex
  }
}
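
The recurring pattern introduced above: embed chunks concurrently under a p-limit cap with exponential-backoff retries, then insert into the repository only after every task has settled, in fixed-size batches, instead of inserting from inside the retry callback. A stripped-down sketch of that flow follows; insertBatch and the Chunk type are placeholders for illustration, not the plugin's actual API, and the backOff import assumes the exponential-backoff package whose option names the diff uses.

import pLimit from 'p-limit'
import { backOff } from 'exponential-backoff'

type Chunk = { content: string; embedding?: number[] }

// Placeholder standing in for repository.insertVectors()
async function insertBatch(batch: Chunk[]): Promise<void> {
  console.log(`inserted ${batch.length} chunks`)
}

async function embedThenInsert(
  chunks: Chunk[],
  getEmbedding: (text: string) => Promise<number[]>,
): Promise<void> {
  const limit = pLimit(50) // cap on concurrent embedding requests, as in the diff
  const batchSize = 100    // flush size used by the new code

  // 1) Embed everything first, retrying each chunk with backoff on failure.
  await Promise.all(
    chunks.map((chunk) =>
      limit(() =>
        backOff(
          async () => {
            chunk.embedding = await getEmbedding(chunk.content)
          },
          { numOfAttempts: 5, startingDelay: 1000, timeMultiple: 1.5, jitter: 'full' },
        ),
      ),
    ),
  )

  // 2) Only after all embeddings resolve, insert in slices of batchSize.
  for (let i = 0; i < chunks.length; i += batchSize) {
    await insertBatch(chunks.slice(i, i + batchSize))
  }
}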