add tool use, update system prompt

2026-05-11 01:27:46 +00:00 · 2025-03-12 21:39:29 +08:00
parent cabf2d5fa4
commit b0fbbb22d3
36 changed files with 7149 additions and 430 deletions
--- a/src/core/diff/strategies/new-unified/search-strategies.ts
+++ b/src/core/diff/strategies/new-unified/search-strategies.ts
@@ -0,0 +1,408 @@
+import { compareTwoStrings } from "string-similarity"
+import { closest } from "fastest-levenshtein"
+import { diff_match_patch } from "diff-match-patch"
+import { Change, Hunk } from "./types"
+
+// Result reported by every search strategy in this file.
+// A failed search is reported as index -1 with confidence 0.
+export type SearchResult = {
+	// Line index into the content array where the match starts, or -1 when nothing matched
+	index: number
+	// Score used to rank results; findBestMatch keeps the result with the highest value
+	confidence: number
+	// Name of the strategy that produced the result (e.g. "exact", "anchor", "similarity")
+	strategy: string
+}
+
+// Content longer than this many lines gets a relaxed threshold in getAdaptiveThreshold
+const LARGE_FILE_THRESHOLD = 1000 // lines
+// Multiplier applied to the uniqueness score (0..1) to obtain the confidence bonus
+const UNIQUE_CONTENT_BOOST = 0.05
+// Default overlap used when creating windows and when pairing matches from adjacent windows
+const DEFAULT_OVERLAP_SIZE = 3 // lines of overlap between windows
+// Upper bound on the doubled window size; a search longer than this still gets a window of its own length
+const MAX_WINDOW_SIZE = 500 // maximum lines in a window
+
+// Helper function to calculate adaptive confidence threshold based on file size.
+// Content of up to LARGE_FILE_THRESHOLD lines keeps baseThreshold unchanged. Larger content
+// gets the threshold relaxed by 0.07, with a floor of 0.8. The result never exceeds
+// baseThreshold, so a caller that asked for less than 0.8 is not handed a stricter value.
+function getAdaptiveThreshold(contentLength: number, baseThreshold: number): number {
+	if (contentLength <= LARGE_FILE_THRESHOLD) {
+		return baseThreshold
+	}
+	// Relax by 0.07 but not below 0.8; the outer Math.min keeps the floor from raising the threshold
+	return Math.min(baseThreshold, Math.max(baseThreshold - 0.07, 0.8))
+}
+
+// Scores how distinctive the search text is inside the content.
+// Returns the fraction of distinct search lines that occur (as a literal substring of the
+// newline-joined content) once or twice. Lines that never occur are not counted as unique.
+function evaluateContentUniqueness(searchStr: string, content: string[]): number {
+	const haystack = content.join("\n")
+	const distinctLines = [...new Set(searchStr.split("\n"))]
+
+	const rareLines = distinctLines.filter((candidate) => {
+		// Escape regex metacharacters so the line is matched literally
+		const literalPattern = new RegExp(candidate.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g")
+		const occurrences = haystack.match(literalPattern)
+		// match() yields null when there is no occurrence at all
+		return occurrences !== null && occurrences.length <= 2
+	})
+
+	return rareLines.length / distinctLines.length
+}
+
+// Builds the text to search for from a list of changes: the originalLine of every
+// "context" and "remove" change, in order, joined with newlines. Other changes are skipped.
+export function prepareSearchString(changes: Change[]): string {
+	const originalLines: Change["originalLine"][] = []
+	for (const change of changes) {
+		if (change.type === "context" || change.type === "remove") {
+			originalLines.push(change.originalLine)
+		}
+	}
+	return originalLines.join("\n")
+}
+
+// Scores how alike two texts are by delegating to compareTwoStrings from "string-similarity".
+export function evaluateSimilarity(original: string, modified: string): number {
+	const score = compareTwoStrings(original, modified)
+	return score
+}
+
+// Similarity check routed through diff-match-patch: diffs original against modified,
+// turns the cleaned-up diff into patches, applies them to original, and scores the
+// patched text against modified with evaluateSimilarity.
+export function getDMPSimilarity(original: string, modified: string): number {
+	const patcher = new diff_match_patch()
+
+	const delta = patcher.diff_main(original, modified)
+	patcher.diff_cleanupSemantic(delta) // rewrites delta in place
+
+	const applied = patcher.patch_apply(patcher.patch_make(original, delta), original)
+	const patchedText = applied[0]
+
+	return evaluateSimilarity(patchedText, modified)
+}
+
+// Scores an edit result against the hunk that produced it.
+// Returns the similarity between the hunk's post-edit text (context + added lines) and result.
+// When result is still almost identical to the pre-edit text (context + removed lines) and is
+// not a perfect match for the post-edit text, the score is scaled down to 80%.
+export function validateEditResult(hunk: Hunk, result: string): number {
+	// Renders one side of the hunk: context lines plus the lines of the given change type
+	const renderSide = (sideType: "add" | "remove"): string =>
+		hunk.changes
+			.filter((change) => change.type === "context" || change.type === sideType)
+			.map((change) => (change.indent ? change.indent + change.content : change.content))
+			.join("\n")
+
+	const similarity = getDMPSimilarity(renderSide("add"), result)
+	const originalSimilarity = getDMPSimilarity(renderSide("remove"), result)
+
+	// Result looks unchanged from the original: the location was right, the edit likely was not applied
+	const looksUnchanged = originalSimilarity > 0.97 && similarity !== 1
+	if (looksUnchanged) {
+		return 0.8 * similarity
+	}
+
+	return similarity
+}
+
+// Scores how well the search text agrees with a candidate region of the content.
+// Search lines starting with "-" are dropped before comparing. The similarity gets a small
+// uniqueness bonus, and is scaled to 30% first when it falls below the adaptive threshold.
+function validateContextLines(searchStr: string, content: string, confidenceThreshold: number): number {
+	const candidateLines = content.split("\n")
+
+	// Exclude removed lines from the comparison text
+	const keptSearchText = searchStr
+		.split("\n")
+		.filter((line) => !line.startsWith("-"))
+		.join("\n")
+	const similarity = evaluateSimilarity(keptSearchText, content)
+
+	// Threshold adapts to the size of the candidate region
+	const threshold = getAdaptiveThreshold(candidateLines.length, confidenceThreshold)
+
+	// Bonus grows with the share of search lines that are rare in the candidate region
+	const uniquenessBoost = evaluateContentUniqueness(searchStr, candidateLines) * UNIQUE_CONTENT_BOOST
+
+	if (similarity < threshold) {
+		return similarity * 0.3 + uniquenessBoost
+	}
+	return similarity + uniquenessBoost
+}
+
+// Helper function to create overlapping windows of lines for windowed searching.
+// Each window holds up to max(searchSize, min(2 * searchSize, MAX_WINDOW_SIZE)) lines and records
+// the index of its first line. Trailing windows shorter than searchSize are dropped.
+// Consecutive windows overlap by at least searchSize - 1 lines (more if overlapSize asks for it),
+// so every run of searchSize consecutive lines lies entirely inside at least one window.
+function createOverlappingWindows(
+	content: string[],
+	searchSize: number,
+	overlapSize: number = DEFAULT_OVERLAP_SIZE,
+): { window: string[]; startIndex: number }[] {
+	const windows: { window: string[]; startIndex: number }[] = []
+
+	// Ensure minimum window size is at least searchSize
+	const effectiveWindowSize = Math.max(searchSize, Math.min(searchSize * 2, MAX_WINDOW_SIZE))
+
+	// A smaller overlap would let a match straddle two windows and be contained in neither
+	const requiredOverlap = Math.max(overlapSize, searchSize - 1)
+
+	// Ensure overlap size doesn't exceed window size
+	const effectiveOverlapSize = Math.min(requiredOverlap, effectiveWindowSize - 1)
+
+	// Calculate step size, ensure it's at least 1
+	const stepSize = Math.max(1, effectiveWindowSize - effectiveOverlapSize)
+
+	for (let i = 0; i < content.length; i += stepSize) {
+		const windowContent = content.slice(i, i + effectiveWindowSize)
+		if (windowContent.length >= searchSize) {
+			windows.push({ window: windowContent, startIndex: i })
+		}
+	}
+
+	return windows
+}
+
+// Helper function to combine matches that were found in adjacent windows.
+// Matches are processed from highest to lowest confidence. A match whose neighbour window
+// (windowIndex differing by exactly 1) reported an index within overlapSize lines is merged
+// with it: the merged result keeps the stronger match's index, averages the confidences,
+// adds a boost of 0.05 per merged neighbour (at most 0.1, total capped at 1) and gets the
+// suffix "-overlapping" on its strategy. Each window contributes to at most one result.
+// The input array is not modified; results come back ordered by the input's confidence ranking.
+function combineOverlappingMatches(
+	matches: (SearchResult & { windowIndex: number })[],
+	overlapSize: number = DEFAULT_OVERLAP_SIZE,
+): SearchResult[] {
+	if (matches.length === 0) {
+		return []
+	}
+
+	// Sort a copy by confidence so the caller's array keeps its order
+	const ranked = [...matches].sort((a, b) => b.confidence - a.confidence)
+
+	const combinedMatches: SearchResult[] = []
+	const usedIndices = new Set<number>()
+
+	for (const match of ranked) {
+		if (usedIndices.has(match.windowIndex)) {
+			continue
+		}
+
+		// Find overlapping matches
+		const overlapping = ranked.filter(
+			(m) =>
+				Math.abs(m.windowIndex - match.windowIndex) === 1 &&
+				Math.abs(m.index - match.index) <= overlapSize &&
+				!usedIndices.has(m.windowIndex),
+		)
+
+		if (overlapping.length > 0) {
+			// Boost confidence if we find same match in overlapping windows
+			const avgConfidence =
+				(match.confidence + overlapping.reduce((sum, m) => sum + m.confidence, 0)) / (overlapping.length + 1)
+			const boost = Math.min(0.05 * overlapping.length, 0.1) // Max 10% boost
+
+			combinedMatches.push({
+				index: match.index,
+				confidence: Math.min(1, avgConfidence + boost),
+				strategy: `${match.strategy}-overlapping`,
+			})
+
+			usedIndices.add(match.windowIndex)
+			overlapping.forEach((m) => usedIndices.add(m.windowIndex))
+		} else {
+			// Copy only the SearchResult fields; windowIndex is internal bookkeeping
+			combinedMatches.push({
+				index: match.index,
+				confidence: match.confidence,
+				strategy: match.strategy,
+			})
+			usedIndices.add(match.windowIndex)
+		}
+	}
+
+	return combinedMatches
+}
+
+// Exact-text strategy: looks for searchStr verbatim inside overlapping windows of the content
+// (from startIndex onward). Each hit is scored with getDMPSimilarity and validateContextLines,
+// hits from adjacent windows are merged, and the first merged result is returned.
+// Returns index -1 / confidence 0 when no window contains the text.
+export function findExactMatch(
+	searchStr: string,
+	content: string[],
+	startIndex: number = 0,
+	confidenceThreshold: number = 0.97,
+): SearchResult {
+	const searchLineCount = searchStr.split("\n").length
+	const windows = createOverlappingWindows(content.slice(startIndex), searchLineCount)
+	const matches: (SearchResult & { windowIndex: number })[] = []
+
+	for (const [windowIndex, windowData] of windows.entries()) {
+		const windowStr = windowData.window.join("\n")
+		const charOffset = windowStr.indexOf(searchStr)
+		if (charOffset === -1) {
+			continue
+		}
+
+		// Convert the character offset of the hit into a line offset inside the window
+		const lineOffset = windowStr.slice(0, charOffset).split("\n").length - 1
+		const matchedContent = windowData.window.slice(lineOffset, lineOffset + searchLineCount).join("\n")
+
+		// The weaker of the two checks decides the confidence
+		const confidence = Math.min(
+			getDMPSimilarity(searchStr, matchedContent),
+			validateContextLines(searchStr, matchedContent, confidenceThreshold),
+		)
+
+		matches.push({
+			// Window positions are relative to the slice, so add startIndex back
+			index: startIndex + windowData.startIndex + lineOffset,
+			confidence,
+			strategy: "exact",
+			windowIndex,
+		})
+	}
+
+	const combinedMatches = combineOverlappingMatches(matches)
+	if (combinedMatches.length > 0) {
+		return combinedMatches[0]
+	}
+	return { index: -1, confidence: 0, strategy: "exact" }
+}
+
+// String similarity strategy: slides a window of the search text's line count over the content
+// (from startIndex onward) and keeps the best-scoring position. A window is only considered when
+// its compareTwoStrings score beats the best score so far and reaches confidenceThreshold; its
+// final score is that raw score multiplied by the weaker of the DMP and context checks.
+export function findSimilarityMatch(
+	searchStr: string,
+	content: string[],
+	startIndex: number = 0,
+	confidenceThreshold: number = 0.97,
+): SearchResult {
+	const windowLength = searchStr.split("\n").length
+	const endExclusive = content.length - windowLength + 1
+	let bestScore = 0
+	let bestIndex = -1
+
+	for (let start = startIndex; start < endExclusive; start++) {
+		const candidate = content.slice(start, start + windowLength).join("\n")
+		const rawScore = compareTwoStrings(searchStr, candidate)
+
+		// Cheap pre-filter before running the more expensive validations
+		if (!(rawScore > bestScore && rawScore >= confidenceThreshold)) {
+			continue
+		}
+
+		const validated = Math.min(
+			getDMPSimilarity(searchStr, candidate),
+			validateContextLines(searchStr, candidate, confidenceThreshold),
+		)
+		const adjustedScore = validated * rawScore
+
+		if (adjustedScore > bestScore) {
+			bestScore = adjustedScore
+			bestIndex = start
+		}
+	}
+
+	return {
+		index: bestIndex,
+		confidence: bestIndex !== -1 ? bestScore : 0,
+		strategy: "similarity",
+	}
+}
+
+// Levenshtein strategy: collects every window of the search text's line count (from startIndex
+// onward), lets fastest-levenshtein's closest() pick the nearest one, and scores it with the
+// weaker of the DMP and context checks. Returns index -1 when there are no windows or the
+// confidence is 0.
+export function findLevenshteinMatch(
+	searchStr: string,
+	content: string[],
+	startIndex: number = 0,
+	confidenceThreshold: number = 0.97,
+): SearchResult {
+	const windowLength = searchStr.split("\n").length
+	const endExclusive = content.length - windowLength + 1
+
+	const candidates: string[] = []
+	for (let start = startIndex; start < endExclusive; start++) {
+		candidates.push(content.slice(start, start + windowLength).join("\n"))
+	}
+
+	if (candidates.length === 0) {
+		return { index: -1, confidence: 0, strategy: "levenshtein" }
+	}
+
+	const nearest = closest(searchStr, candidates)
+	// candidates[k] starts at content line startIndex + k
+	const index = startIndex + candidates.indexOf(nearest)
+	const confidence = Math.min(
+		getDMPSimilarity(searchStr, nearest),
+		validateContextLines(searchStr, nearest, confidenceThreshold),
+	)
+
+	return {
+		index: confidence === 0 ? -1 : index,
+		confidence: index !== -1 ? confidence : 0,
+		strategy: "levenshtein",
+	}
+}
+
+// Picks the anchor lines of a search text: its first and its last line that contain something
+// other than whitespace. Both are null when every line is blank; both are the same line when
+// only one line has text.
+function identifyAnchors(searchStr: string): { first: string | null; last: string | null } {
+	const lines = searchStr.split("\n")
+	const hasText = (line: string): boolean => line.trim() !== ""
+
+	const firstWithText = lines.find(hasText)
+	// Search a reversed copy so the original order is left alone
+	const lastWithText = [...lines].reverse().find(hasText)
+
+	return {
+		first: firstWithText === undefined ? null : firstWithText,
+		last: lastWithText === undefined ? null : lastWithText,
+	}
+}
+
+// Anchor-based search strategy.
+// Uses the first and last non-blank lines of the search text as anchors. The first anchor must
+// occur exactly once in the whole content and be located at or after startIndex; the last anchor
+// is taken at its latest occurrence and must come after the first. The lines between the anchors
+// in the content are then compared with the lines between the anchors in the search text; the
+// match is accepted with confidence 1 when that similarity reaches the adaptive threshold.
+// Returns index -1 / confidence 0 in every other case.
+export function findAnchorMatch(
+	searchStr: string,
+	content: string[],
+	startIndex: number = 0,
+	confidenceThreshold: number = 0.97,
+): SearchResult {
+	const searchLines = searchStr.split("\n")
+	const { first, last } = identifyAnchors(searchStr)
+
+	if (!first || !last) {
+		return { index: -1, confidence: 0, strategy: "anchor" }
+	}
+
+	// Check if the first anchor is unique (counted over the whole content, not just from startIndex)
+	const firstOccurrences = content.filter((contentLine) => contentLine === first).length
+	if (firstOccurrences !== 1) {
+		return { index: -1, confidence: 0, strategy: "anchor" }
+	}
+
+	// Find the first anchor: earliest occurrence at or after startIndex
+	let firstIndex = -1
+	for (let i = startIndex; i < content.length; i++) {
+		if (content[i] === first) {
+			firstIndex = i
+			break
+		}
+	}
+
+	// Find the last anchor: latest occurrence at or after startIndex
+	let lastIndex = -1
+	for (let i = content.length - 1; i >= startIndex; i--) {
+		if (content[i] === last) {
+			lastIndex = i
+			break
+		}
+	}
+
+	if (firstIndex === -1 || lastIndex === -1 || lastIndex <= firstIndex) {
+		return { index: -1, confidence: 0, strategy: "anchor" }
+	}
+
+	// Validate the context between the anchors.
+	// The last anchor is the LAST non-blank search line, so locate it with lastIndexOf: its text
+	// may also appear earlier in the search (e.g. a repeated "}"), and indexOf would cut the
+	// expected context short.
+	const expectedContext = searchLines
+		.slice(searchLines.indexOf(first) + 1, searchLines.lastIndexOf(last))
+		.join("\n")
+	const actualContext = content.slice(firstIndex + 1, lastIndex).join("\n")
+	const contextSimilarity = evaluateSimilarity(expectedContext, actualContext)
+
+	if (contextSimilarity < getAdaptiveThreshold(content.length, confidenceThreshold)) {
+		return { index: -1, confidence: 0, strategy: "anchor" }
+	}
+
+	// An accepted anchor match is always reported with full confidence
+	const confidence = 1
+
+	return {
+		index: firstIndex,
+		confidence: confidence,
+		strategy: "anchor",
+	}
+}
+
+// Main search function: runs the exact, anchor, similarity and Levenshtein strategies in that
+// order and returns the result with the highest confidence. On a tie the earlier strategy wins;
+// when no strategy reports a confidence above 0 the result is index -1 with strategy "none".
+export function findBestMatch(
+	searchStr: string,
+	content: string[],
+	startIndex: number = 0,
+	confidenceThreshold: number = 0.97,
+): SearchResult {
+	const strategies = [findExactMatch, findAnchorMatch, findSimilarityMatch, findLevenshteinMatch]
+	const nothingFound: SearchResult = { index: -1, confidence: 0, strategy: "none" }
+
+	return strategies.reduce<SearchResult>((best, strategy) => {
+		const candidate = strategy(searchStr, content, startIndex, confidenceThreshold)
+		// Strictly greater, so an earlier strategy keeps its place on equal confidence
+		return candidate.confidence > best.confidence ? candidate : best
+	}, nothingFound)
+}