🎯 PR: Improve Edit Tool Reliability with Fuzzy Matching Pipeline (#1025)

This commit is contained in:
tanzhenxin
2025-11-13 19:01:09 +08:00
committed by GitHub
parent b029f0d2ce
commit 0752a31e1e
5 changed files with 718 additions and 109 deletions

View File

@@ -0,0 +1,153 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect, it } from 'vitest';
import {
countOccurrences,
maybeAugmentOldStringForDeletion,
normalizeEditStrings,
} from './editHelper.js';
// Exercises the matching pipeline of normalizeEditStrings: literal match
// first, then Unicode look-alike normalization, then line-based matching.
describe('normalizeEditStrings', () => {
  // Two-line fixture that ends with a newline.
  const file = 'const one = 1;\nconst two = 2;\n';

  it('returns literal matches unchanged and trims new_string trailing whitespace', () => {
    const result = normalizeEditStrings(
      file,
      'const two = 2;',
      ' const two = 42; ',
    );
    expect(result).toEqual({
      oldString: 'const two = 2;',
      newString: ' const two = 42;',
    });
  });

  it('normalizes smart quotes to match on-disk text', () => {
    const result = normalizeEditStrings(
      "const greeting = 'Don't';\n",
      // The proposed old_string uses curly single quotes (U+2018 / U+2019);
      // written as escapes so the characters cannot be lost by tooling.
      'const greeting = \u2018Don\u2019t\u2019;',
      'const greeting = “Hello”; ',
    );
    expect(result).toEqual({
      // The canonical slice comes from the file, so it has straight quotes.
      oldString: "const greeting = 'Don't';",
      // new_string is only trimmed; its curly quotes are kept as provided.
      newString: 'const greeting = “Hello”;',
    });
  });

  it('falls back to original strings when no match is found', () => {
    const result = normalizeEditStrings(file, 'missing text', 'replacement');
    expect(result).toEqual({
      oldString: 'missing text',
      newString: 'replacement',
    });
  });

  it('still trims new_string when editing a brand-new file', () => {
    const result = normalizeEditStrings(null, '', 'new file contents ');
    expect(result).toEqual({
      oldString: '',
      newString: 'new file contents',
    });
  });

  it('matches unicode dash variants', () => {
    const result = normalizeEditStrings(
      'const range = "1-2";\n',
      'const range = "1\u20132";',
      'const range = "3\u20135"; ',
    );
    expect(result).toEqual({
      oldString: 'const range = "1-2";',
      newString: 'const range = "3\u20135";',
    });
  });

  it('matches when trailing whitespace differs only at line ends', () => {
    const result = normalizeEditStrings(
      'value = 1;\n',
      'value = 1; \n',
      'value = 2; \n',
    );
    expect(result).toEqual({
      oldString: 'value = 1;\n',
      newString: 'value = 2;\n',
    });
  });

  it('treats non-breaking spaces as regular spaces', () => {
    const result = normalizeEditStrings(
      'const label = "hello world";\n',
      'const label = "hello\u00a0world";',
      'const label = "hi\u00a0world";',
    );
    expect(result).toEqual({
      oldString: 'const label = "hello world";',
      newString: 'const label = "hi\u00a0world";',
    });
  });

  it('drops trailing newline from new content when the file lacks it', () => {
    const result = normalizeEditStrings(
      'console.log("hi")',
      'console.log("hi")\n',
      'console.log("bye")\n',
    );
    expect(result).toEqual({
      oldString: 'console.log("hi")',
      newString: 'console.log("bye")',
    });
  });
});
// Covers the empty/missing-needle guard and the non-overlapping counting rule.
describe('countOccurrences', () => {
  it('returns zero when substring empty or missing', () => {
    for (const needle of ['', 'z']) {
      expect(countOccurrences('abc', needle)).toBe(0);
    }
  });

  it('counts non-overlapping occurrences', () => {
    const found = countOccurrences('aaaa', 'aa');
    expect(found).toBe(2);
  });
});
// Checks when a deletion's old string is extended to swallow the newline that
// follows it in the file, and when it is left alone.
describe('maybeAugmentOldStringForDeletion', () => {
  const file = 'console.log("hi")\nconsole.log("bye")\n';

  it('appends newline when deleting text followed by newline', () => {
    const augmented = maybeAugmentOldStringForDeletion(
      file,
      'console.log("hi")',
      '',
    );
    expect(augmented).toBe('console.log("hi")\n');
  });

  it('leaves strings untouched when not deleting', () => {
    const augmented = maybeAugmentOldStringForDeletion(
      file,
      'console.log("hi")',
      'replacement',
    );
    expect(augmented).toBe('console.log("hi")');
  });

  it('does not append newline when file lacks the variant', () => {
    const augmented = maybeAugmentOldStringForDeletion(
      'console.log("hi")',
      'console.log("hi")',
      '',
    );
    expect(augmented).toBe('console.log("hi")');
  });

  it('no-ops when the old string already ends with a newline', () => {
    const augmented = maybeAugmentOldStringForDeletion(
      file,
      'console.log("bye")\n',
      '',
    );
    expect(augmented).toBe('console.log("bye")\n');
  });
});

View File

@@ -0,0 +1,499 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* Helpers for reconciling LLM-proposed edits with on-disk text.
*
* The normalization pipeline intentionally stays deterministic: we first try
* literal substring matches, then gradually relax comparison rules (smart
* quotes, em-dashes, trailing whitespace, etc.) until we either locate the
* exact slice from the file or conclude the edit cannot be applied.
*/
/* -------------------------------------------------------------------------- */
/* Character-level normalization */
/* -------------------------------------------------------------------------- */
/**
 * Lookup table from typographic look-alike characters to their plain ASCII
 * equivalents. Every key and every value is a single UTF-16 code unit, so
 * substituting through this table never changes a string's length.
 */
const UNICODE_EQUIVALENT_MAP: Record<string, string> = (() => {
  // Each entry pairs an ASCII replacement with the variants that map to it.
  const groups: Array<[string, string]> = [
    // Hyphen variations → ASCII hyphen-minus.
    ['-', '\u2010\u2011\u2012\u2013\u2014\u2015\u2212'],
    // Curly single quotes → straight apostrophe.
    ["'", '\u2018\u2019\u201A\u201B'],
    // Curly double quotes → straight double quote.
    ['"', '\u201C\u201D\u201E\u201F'],
    // Whitespace variants → normal space.
    [
      ' ',
      '\u00A0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F\u205F\u3000',
    ],
  ];

  const table: Record<string, string> = {};
  for (const [replacement, variants] of groups) {
    for (const variant of variants) {
      table[variant] = replacement;
    }
  }
  return table;
})();
/**
 * Replaces every character listed in UNICODE_EQUIVALENT_MAP with its ASCII
 * counterpart and leaves all other characters untouched. The text is walked
 * by code point, so surrogate pairs are passed through whole.
 */
function normalizeBasicCharacters(text: string): string {
  return Array.from(
    text,
    (symbol) => UNICODE_EQUIVALENT_MAP[symbol] ?? symbol,
  ).join('');
}
/**
 * Trims whitespace from the end of every line. The line terminators
 * themselves (`\r\n`, `\n` or `\r`) are emitted exactly as they appeared.
 */
function stripTrailingWhitespacePreserveNewlines(text: string): string {
  // The capture group keeps separators in the output array: even positions
  // hold line content, odd positions hold the terminator that followed it.
  return text
    .split(/(\r\n|\n|\r)/)
    .map((piece, position) => (position % 2 === 0 ? piece.trimEnd() : piece))
    .join('');
}
/* -------------------------------------------------------------------------- */
/* Line-based search helpers */
/* -------------------------------------------------------------------------- */
/** Outcome of locating a proposed old string inside the file content. */
interface MatchedSliceResult {
/** The characters copied out of the haystack for the matched region. */
slice: string;
/**
 * True when the match only succeeded after the needle's final empty line
 * (the piece after its last `\n`) was dropped; normalizeEditStrings then
 * removes one trailing newline from the replacement text.
 */
removedTrailingFinalEmptyLine: boolean;
}
/**
 * Ordered per-line transforms used when comparing file lines with pattern
 * lines. Later passes are more lenient, but none of them touches leading
 * whitespace, so a match can never land on a different indentation level.
 */
const LINE_COMPARISON_PASSES: Array<(value: string) => string> = [
  // Pass 1: lines must be identical.
  function exactLine(line) {
    return line;
  },
  // Pass 2: differences in trailing whitespace are ignored.
  function withoutTrailingWhitespace(line) {
    return line.trimEnd();
  },
];
/**
 * Most lenient line transform: maps Unicode look-alikes to ASCII and then
 * drops trailing whitespace. Leading whitespace is kept.
 */
function normalizeLineForComparison(value: string): string {
  const asciiEquivalent = normalizeBasicCharacters(value);
  return asciiEquivalent.trimEnd();
}
/**
 * Finds the first index at which `pattern` occurs as a contiguous run inside
 * `lines`, comparing both sides after applying `transform`.
 *
 * `transform` is assumed to be pure: each pattern line is transformed exactly
 * once and each file line at most once, however many candidate offsets are
 * tried.
 *
 * @param lines The file content split into lines.
 * @param pattern The lines to look for.
 * @param transform Applied to every line on both sides before comparison.
 * @returns The zero-based start index of the first match, `0` for an empty
 *   pattern, or `null` when the pattern does not occur.
 */
function seekSequenceWithTransform(
  lines: string[],
  pattern: string[],
  transform: (value: string) => string,
): number | null {
  if (pattern.length === 0) {
    return 0;
  }
  if (pattern.length > lines.length) {
    return null;
  }

  // Loop-invariant: the transformed pattern never changes between offsets.
  const wanted = pattern.map((line) => transform(line));

  // File lines are transformed lazily and memoized, so an early match does
  // not pay for transforming the rest of the file.
  const seen: Array<string | undefined> = new Array(lines.length);
  const transformedLineAt = (index: number): string => {
    let cached = seen[index];
    if (cached === undefined) {
      cached = transform(lines[index] ?? '');
      seen[index] = cached;
    }
    return cached;
  };

  const lastStart = lines.length - pattern.length;
  for (let start = 0; start <= lastStart; start++) {
    let matches = true;
    for (let offset = 0; offset < wanted.length; offset++) {
      if (transformedLineAt(start + offset) !== wanted[offset]) {
        matches = false;
        break;
      }
    }
    if (matches) {
      return start;
    }
  }
  return null;
}
/**
 * Splits `text` on `\n` and records where each line starts.
 *
 * @returns `lines` — the split pieces (a `\r` before `\n` stays on its
 *   line); `offsets` — one start offset per line plus a final sentinel equal
 *   to `text.length`.
 */
function buildLineIndex(text: string): {
  lines: string[];
  offsets: number[];
} {
  const lines = text.split('\n');
  const offsets: number[] = [];
  let position = 0;
  for (const line of lines) {
    offsets.push(position);
    // Step over the line and the '\n' that split() removed after it.
    position += line.length + 1;
  }
  // Sentinel entry marking the end of the text.
  offsets.push(text.length);
  return { lines, offsets };
}
/**
 * Returns the original characters of `text` covered by `lineCount` lines
 * starting at `startLine`.
 *
 * @param text The full text the line index was built from.
 * @param offsets Start offset of each line, plus an end-of-text sentinel.
 * @param lines The lines of `text`, split on `\n`.
 * @param startLine Zero-based index of the first line in the range.
 * @param lineCount Number of lines in the range.
 * @param includeTrailingNewline When true, the slice extends to the start of
 *   the line that follows the range, which takes in the newline after the
 *   final line.
 */
function sliceFromLines(
  text: string,
  offsets: number[],
  lines: string[],
  startLine: number,
  lineCount: number,
  includeTrailingNewline: boolean,
): string {
  if (lineCount === 0) {
    if (includeTrailingNewline) {
      return '\n';
    }
    return '';
  }

  const from = offsets[startLine] ?? 0;
  const finalLine = startLine + lineCount - 1;

  if (includeTrailingNewline) {
    const afterRange = offsets[startLine + lineCount];
    if (afterRange !== undefined) {
      return text.slice(from, afterRange);
    }
    // No offset recorded past the range: run to end of text if it ends in '\n'.
    if (text.endsWith('\n')) {
      return text.slice(from, text.length);
    }
  }

  // Otherwise stop at the last character of the final line.
  const to = (offsets[finalLine] ?? 0) + (lines[finalLine]?.length ?? 0);
  return text.slice(from, to);
}
/**
 * Line-oriented fallback used when substring search fails. The needle is
 * split into lines and located in the haystack with increasingly lenient
 * per-line comparisons (exact, trailing whitespace ignored, Unicode
 * look-alikes normalized).
 *
 * If the needle ends with `\n` and cannot be matched as a whole, a second
 * attempt is made without its final empty line; that case is reported via
 * `removedTrailingFinalEmptyLine` so the caller can drop the matching
 * trailing newline from the replacement text.
 *
 * @param haystack The file content to search.
 * @param needle The proposed old string.
 * @returns The exact slice of `haystack` that corresponds to `needle`, or
 *   `null` when no line-based match exists.
 */
function findLineBasedMatch(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  const { lines, offsets } = buildLineIndex(haystack);
  // split() always yields at least one element, so no emptiness check needed.
  const patternLines = needle.split('\n');
  const endsWithNewline = needle.endsWith('\n');

  // Tries every comparison pass in order of strictness.
  const attemptMatch = (candidate: string[]): number | null => {
    for (const pass of LINE_COMPARISON_PASSES) {
      const idx = seekSequenceWithTransform(lines, candidate, pass);
      if (idx !== null) {
        return idx;
      }
    }
    return seekSequenceWithTransform(
      lines,
      candidate,
      normalizeLineForComparison,
    );
  };

  let matchIndex = attemptMatch(patternLines);
  if (matchIndex !== null) {
    // A needle ending in '\n' splits into its content lines plus a trailing
    // '' element, and that element was matched against a real (blank) line of
    // the file. The needle only covers the text up to and including the
    // newline BEFORE that line, so the slice must stop at that line's start.
    // Counting the blank line as content and then also taking the newline
    // after it would swallow an extra line from the file.
    const contentLineCount = endsWithNewline
      ? patternLines.length - 1
      : patternLines.length;
    return {
      slice: sliceFromLines(
        haystack,
        offsets,
        lines,
        matchIndex,
        contentLineCount,
        endsWithNewline,
      ),
      removedTrailingFinalEmptyLine: false,
    };
  }

  if (patternLines.at(-1) === '') {
    // Retry without the trailing empty line (e.g. the file has no newline
    // after the matched text, or is followed by a non-blank line).
    const trimmedPattern = patternLines.slice(0, -1);
    if (trimmedPattern.length === 0) {
      return null;
    }
    matchIndex = attemptMatch(trimmedPattern);
    if (matchIndex !== null) {
      return {
        slice: sliceFromLines(
          haystack,
          offsets,
          lines,
          matchIndex,
          trimmedPattern.length,
          false,
        ),
        removedTrailingFinalEmptyLine: true,
      };
    }
  }
  return null;
}
/* -------------------------------------------------------------------------- */
/* Slice discovery */
/* -------------------------------------------------------------------------- */
/**
 * Locates `needle` inside `haystack` and returns the exact characters from
 * `haystack` that correspond to it.
 *
 * Strategies are tried in order: literal substring, substring after Unicode
 * look-alike normalization, then line-based matching. Returns `null` for an
 * empty needle or when nothing matches.
 */
function findMatchedSlice(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  if (needle === '') {
    return null;
  }

  // Character normalization is one-to-one, so an index found in the
  // normalized haystack is used directly as an index into the original.
  const exactAt = haystack.indexOf(needle);
  const foundAt =
    exactAt !== -1
      ? exactAt
      : normalizeBasicCharacters(haystack).indexOf(
          normalizeBasicCharacters(needle),
        );

  if (foundAt === -1) {
    return findLineBasedMatch(haystack, needle);
  }
  return {
    slice: haystack.slice(foundAt, foundAt + needle.length),
    removedTrailingFinalEmptyLine: false,
  };
}
/*
 * Note on findMatchedSlice (defined above): it returns the literal slice from
 * the haystack that best corresponds to the provided needle, or `null` when
 * no match is found.
 */
/* -------------------------------------------------------------------------- */
/* Replacement helpers */
/* -------------------------------------------------------------------------- */
/**
 * Drops a single line terminator (`\r\n`, `\n` or `\r`) from the end of
 * `text`, if one is present. At most one terminator is removed.
 */
function removeTrailingNewline(text: string): string {
  // `$` without the `m` flag only matches at the very end of the string.
  return text.replace(/(\r\n|\n|\r)$/, '');
}
/**
 * Keeps the replacement text in step with the matched old text: when the
 * match was found by dropping the old string's trailing empty line, one
 * trailing newline is removed from `newString` as well.
 */
function adjustNewStringForTrailingLine(
  newString: string,
  removedTrailingLine: boolean,
): string {
  if (!removedTrailingLine) {
    return newString;
  }
  return removeTrailingNewline(newString);
}
/** The pair of strings produced by {@link normalizeEditStrings}. */
export interface NormalizedEditStrings {
/** The slice found in the file, or the caller's original string if none. */
oldString: string;
/** The replacement text with trailing whitespace stripped from each line. */
newString: string;
}
/**
 * Reconciles a proposed edit with the text that is actually on disk.
 *
 * 1. Trailing whitespace is stripped from every line of `newString`.
 * 2. `oldString` is looked up in `fileContent`, literally first and then
 *    through relaxed matching (smart quotes, line trims, etc.).
 * 3. On a match, the exact slice from the file is returned as `oldString` so
 *    later replacements operate on the real characters.
 *
 * With no file content, an empty `oldString`, or no match, `oldString` is
 * returned unchanged; `newString` is still trimmed.
 */
export function normalizeEditStrings(
  fileContent: string | null,
  oldString: string,
  newString: string,
): NormalizedEditStrings {
  const cleanedReplacement = stripTrailingWhitespacePreserveNewlines(newString);

  // Only search when there is both a file and something to search for.
  const located =
    fileContent !== null && oldString !== ''
      ? findMatchedSlice(fileContent, oldString)
      : null;

  if (located === null) {
    return { oldString, newString: cleanedReplacement };
  }

  return {
    oldString: located.slice,
    newString: adjustNewStringForTrailingLine(
      cleanedReplacement,
      located.removedTrailingFinalEmptyLine,
    ),
  };
}
/**
 * For a pure deletion (empty `newString`), extends `oldString` with a
 * trailing `\n` when the file contains that longer form, so removing the
 * text does not leave an empty line behind. In every other situation the
 * original `oldString` is returned unchanged.
 */
export function maybeAugmentOldStringForDeletion(
  fileContent: string | null,
  oldString: string,
  newString: string,
): string {
  const isDeletion = newString === '';
  if (!isDeletion || fileContent === null) {
    return oldString;
  }
  // Nothing to extend, or the newline is already part of the old string.
  if (oldString === '' || oldString.endsWith('\n')) {
    return oldString;
  }

  const withNewline = oldString + '\n';
  if (fileContent.includes(withNewline)) {
    return withNewline;
  }
  return oldString;
}
/**
 * Counts how many times `substr` appears in `source`, scanning left to right
 * and resuming after each hit so matches never overlap. An empty `substr`
 * yields 0.
 */
export function countOccurrences(source: string, substr: string): number {
  if (substr === '') {
    return 0;
  }
  let total = 0;
  for (
    let at = source.indexOf(substr);
    at !== -1;
    at = source.indexOf(substr, at + substr.length)
  ) {
    total += 1;
  }
  return total;
}
/**
 * Result from extracting a snippet showing the edited region.
 */
export interface EditSnippetResult {
  /** Starting line number (1-indexed) of the snippet */
  startLine: number;
  /** Ending line number (1-indexed) of the snippet */
  endLine: number;
  /** Total number of lines in the new content */
  totalLines: number;
  /** The snippet content (subset of lines from newContent) */
  content: string;
}

/** Default number of unchanged lines shown on each side of the change. */
const SNIPPET_CONTEXT_LINES = 4;
/** Changed regions spanning more than this many lines yield no snippet. */
const SNIPPET_MAX_LINES = 1000;

/**
 * Extracts a snippet from the edited file showing the changed region with
 * surrounding context. The old and new content are compared line-by-line
 * from both ends to locate the changed region.
 *
 * @param oldContent The original file content before the edit (null for new
 *   files, in which case the whole new content is returned as the snippet)
 * @param newContent The new file content after the edit
 * @param contextLines Number of context lines to show before and after the
 *   change. Defaults to {@link SNIPPET_CONTEXT_LINES}; negative values are
 *   treated as 0 and fractional values are rounded down.
 * @returns Snippet information, or null when the content is unchanged, the
 *   new content is empty, or the changed region is too large
 */
export function extractEditSnippet(
  oldContent: string | null,
  newContent: string,
  contextLines: number = SNIPPET_CONTEXT_LINES,
): EditSnippetResult | null {
  const newLines = newContent.split('\n');
  const totalLines = newLines.length;

  // Brand-new file: everything is "changed", so return it all.
  if (oldContent === null) {
    return {
      startLine: 1,
      endLine: totalLines,
      totalLines,
      content: newContent,
    };
  }

  // No changes case (or nothing left to show).
  if (oldContent === newContent || !newContent) {
    return null;
  }

  const oldLines = oldContent.split('\n');

  // Find the first line that differs from the start.
  let firstDiffLine = 0;
  const minLength = Math.min(oldLines.length, newLines.length);
  while (
    firstDiffLine < minLength &&
    oldLines[firstDiffLine] === newLines[firstDiffLine]
  ) {
    firstDiffLine++;
  }

  // Find the first line that differs from the end, never crossing back over
  // the common prefix.
  let oldEndIndex = oldLines.length - 1;
  let newEndIndex = newLines.length - 1;
  while (
    oldEndIndex >= firstDiffLine &&
    newEndIndex >= firstDiffLine &&
    oldLines[oldEndIndex] === newLines[newEndIndex]
  ) {
    oldEndIndex--;
    newEndIndex--;
  }

  // The changed region in the new content is firstDiffLine..newEndIndex
  // (inclusive), converted here to 1-indexed line numbers. For a pure
  // deletion changeEnd is one less than changeStart.
  const changeStart = firstDiffLine + 1;
  const changeEnd = newEndIndex + 1;

  // If the change region is too large, don't generate a snippet.
  if (changeEnd - changeStart > SNIPPET_MAX_LINES) {
    return null;
  }

  // Calculate snippet bounds with context, clamped to the file.
  const context = Math.max(0, Math.floor(contextLines));
  const snippetStart = Math.max(1, changeStart - context);
  const snippetEnd = Math.min(totalLines, changeEnd + context);
  const snippetLines = newLines.slice(snippetStart - 1, snippetEnd);

  return {
    startLine: snippetStart,
    endLine: snippetEnd,
    totalLines,
    content: snippetLines.join('\n'),
  };
}