/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * Helpers for reconciling LLM-proposed edits with on-disk text.
 *
 * The normalization pipeline intentionally stays deterministic: we first try
 * literal substring matches, then gradually relax comparison rules (smart
 * quotes, em-dashes, trailing whitespace, etc.) until we either locate the
 * exact slice from the file or conclude the edit cannot be applied.
 */

/* -------------------------------------------------------------------------- */
/* Character-level normalization                                              */
/* -------------------------------------------------------------------------- */

/**
 * Maps typographic characters to their plain ASCII counterparts.
 *
 * Every key and every value is exactly one UTF-16 code unit, so normalizing a
 * string never changes its length. `findMatchedSlice` relies on this to reuse
 * indices found in normalized text against the original text.
 */
const UNICODE_EQUIVALENT_MAP: Record<string, string> = {
  // Hyphen variations → ASCII hyphen-minus.
  '\u2010': '-',
  '\u2011': '-',
  '\u2012': '-',
  '\u2013': '-',
  '\u2014': '-',
  '\u2015': '-',
  '\u2212': '-',
  // Curly single quotes → straight apostrophe.
  '\u2018': "'",
  '\u2019': "'",
  '\u201A': "'",
  '\u201B': "'",
  // Curly double quotes → straight double quote.
  '\u201C': '"',
  '\u201D': '"',
  '\u201E': '"',
  '\u201F': '"',
  // Whitespace variants → normal space.
  '\u00A0': ' ',
  '\u2002': ' ',
  '\u2003': ' ',
  '\u2004': ' ',
  '\u2005': ' ',
  '\u2006': ' ',
  '\u2007': ' ',
  '\u2008': ' ',
  '\u2009': ' ',
  '\u200A': ' ',
  '\u202F': ' ',
  '\u205F': ' ',
  '\u3000': ' ',
};

/**
 * Replaces every character listed in {@link UNICODE_EQUIVALENT_MAP} with its
 * ASCII equivalent and leaves all other characters untouched.
 */
function normalizeBasicCharacters(text: string): string {
  if (text === '') {
    return text;
  }

  let normalized = '';
  for (const char of text) {
    normalized += UNICODE_EQUIVALENT_MAP[char] ?? char;
  }
  return normalized;
}

/**
 * Removes trailing whitespace from each line while keeping the original newline
 * separators intact.
 */
function stripTrailingWhitespacePreserveNewlines(text: string): string {
  // The capture group makes split() keep the separators: even indices hold
  // line content, odd indices hold the newline sequence that followed it.
  const pieces = text.split(/(\r\n|\n|\r)/);
  let result = '';

  for (let i = 0; i < pieces.length; i++) {
    const segment = pieces[i];
    if (segment === undefined) {
      continue;
    }
    result += i % 2 === 0 ? segment.trimEnd() : segment;
  }

  return result;
}

/* -------------------------------------------------------------------------- */
/* Line-based search helpers                                                  */
/* -------------------------------------------------------------------------- */

interface MatchedSliceResult {
  /** The exact characters taken from the searched text. */
  slice: string;
  /**
   * True when the match only succeeded after dropping the final empty line of
   * the needle (i.e. its trailing newline was not part of the match).
   */
  removedTrailingFinalEmptyLine: boolean;
}

/**
 * Comparison passes become progressively more forgiving, making it possible to
 * match when only trailing whitespace differs. Leading whitespace (indentation)
 * is always preserved to avoid matching at incorrect scope levels.
 */
const LINE_COMPARISON_PASSES: Array<(value: string) => string> = [
  (value) => value,
  (value) => value.trimEnd(),
];

/** Most forgiving line transform: ASCII-normalize, then drop trailing whitespace. */
function normalizeLineForComparison(value: string): string {
  return normalizeBasicCharacters(value).trimEnd();
}

/**
 * Finds the first index where {@link pattern} appears within {@link lines} once
 * both sequences are transformed in the same way.
 *
 * @returns The starting line index, `0` for an empty pattern, or `null` when
 *   the pattern does not occur.
 */
function seekSequenceWithTransform(
  lines: string[],
  pattern: string[],
  transform: (value: string) => string,
): number | null {
  if (pattern.length === 0) {
    return 0;
  }
  if (pattern.length > lines.length) {
    return null;
  }

  outer: for (let i = 0; i <= lines.length - pattern.length; i++) {
    for (let p = 0; p < pattern.length; p++) {
      const actual = lines[i + p];
      const expected = pattern[p];
      // Both indices are in range by construction; the undefined guard only
      // keeps the comparison type-safe.
      if (
        actual === undefined ||
        expected === undefined ||
        transform(actual) !== transform(expected)
      ) {
        continue outer;
      }
    }
    return i;
  }

  return null;
}

/**
 * Splits {@link text} on `\n` and records where each line starts.
 *
 * `offsets[i]` is the character index of line `i`; the extra final entry
 * `offsets[lines.length]` equals `text.length`.
 */
function buildLineIndex(text: string): {
  lines: string[];
  offsets: number[];
} {
  const lines = text.split('\n');
  const offsets: number[] = [];
  let cursor = 0;

  lines.forEach((line, i) => {
    offsets.push(cursor);
    cursor += line.length;
    if (i < lines.length - 1) {
      cursor += 1; // Account for the newline that split() removed.
    }
  });
  offsets.push(text.length);

  return { lines, offsets };
}

/**
 * Reconstructs the original characters for the matched lines, optionally
 * preserving the newline that follows the final line.
 */
function sliceFromLines(
  text: string,
  offsets: number[],
  lines: string[],
  startLine: number,
  lineCount: number,
  includeTrailingNewline: boolean,
): string {
  if (lineCount === 0) {
    return includeTrailingNewline ? '\n' : '';
  }

  const startIndex = offsets[startLine] ?? 0;
  const lastLineIndex = startLine + lineCount - 1;
  const lastLineStart = offsets[lastLineIndex] ?? 0;
  let endIndex = lastLineStart + (lines[lastLineIndex]?.length ?? 0);

  if (includeTrailingNewline) {
    // The start of the following line is one past the newline we want to keep.
    const nextLineStart = offsets[startLine + lineCount];
    if (nextLineStart !== undefined) {
      endIndex = nextLineStart;
    } else if (text.endsWith('\n')) {
      endIndex = text.length;
    }
  }

  return text.slice(startIndex, endIndex);
}

/**
 * Locates {@link needle} inside {@link haystack} by comparing whole lines with
 * progressively more forgiving transforms (exact, trailing whitespace trimmed,
 * then ASCII-normalized and trimmed).
 *
 * If the needle ends with a newline and cannot be matched as-is, the search is
 * retried without its final empty line; that case is reported through
 * `removedTrailingFinalEmptyLine`.
 */
function findLineBasedMatch(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  const { lines, offsets } = buildLineIndex(haystack);
  const patternLines = needle.split('\n');
  const endsWithNewline = needle.endsWith('\n');

  const attemptMatch = (candidate: string[]): number | null => {
    for (const pass of LINE_COMPARISON_PASSES) {
      const idx = seekSequenceWithTransform(lines, candidate, pass);
      if (idx !== null) {
        return idx;
      }
    }
    return seekSequenceWithTransform(
      lines,
      candidate,
      normalizeLineForComparison,
    );
  };

  let matchIndex = attemptMatch(patternLines);
  if (matchIndex !== null) {
    // When the needle ends with '\n', the last pattern element is the empty
    // string that follows that newline. It has to line up with a blank line in
    // the file, but that blank line is not part of the needle: the slice must
    // stop right after the newline of the last real line. Including it would
    // swallow an extra blank line on replacement.
    const matchedLineCount =
      endsWithNewline && patternLines.length > 1
        ? patternLines.length - 1
        : patternLines.length;

    return {
      slice: sliceFromLines(
        haystack,
        offsets,
        lines,
        matchIndex,
        matchedLineCount,
        endsWithNewline,
      ),
      removedTrailingFinalEmptyLine: false,
    };
  }

  if (patternLines[patternLines.length - 1] === '') {
    const trimmedPattern = patternLines.slice(0, -1);
    if (trimmedPattern.length === 0) {
      return null;
    }

    matchIndex = attemptMatch(trimmedPattern);
    if (matchIndex !== null) {
      return {
        slice: sliceFromLines(
          haystack,
          offsets,
          lines,
          matchIndex,
          trimmedPattern.length,
          false,
        ),
        removedTrailingFinalEmptyLine: true,
      };
    }
  }

  return null;
}

/* -------------------------------------------------------------------------- */
/* Slice discovery                                                            */
/* -------------------------------------------------------------------------- */

/**
 * Returns the literal slice from {@link haystack} that best corresponds to the
 * provided {@link needle}, or `null` when no match is found.
 *
 * Strategies are tried in order: exact substring, substring after character
 * normalization, then line-based matching.
 */
function findMatchedSlice(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  if (needle === '') {
    return null;
  }

  const literalIndex = haystack.indexOf(needle);
  if (literalIndex !== -1) {
    return {
      slice: haystack.slice(literalIndex, literalIndex + needle.length),
      removedTrailingFinalEmptyLine: false,
    };
  }

  const normalizedHaystack = normalizeBasicCharacters(haystack);
  const normalizedNeedleChars = normalizeBasicCharacters(needle);
  const normalizedIndex = normalizedHaystack.indexOf(normalizedNeedleChars);
  if (normalizedIndex !== -1) {
    // Normalization is length-preserving, so the index and the needle length
    // are valid coordinates in the original haystack as well.
    return {
      slice: haystack.slice(normalizedIndex, normalizedIndex + needle.length),
      removedTrailingFinalEmptyLine: false,
    };
  }

  return findLineBasedMatch(haystack, needle);
}

/* -------------------------------------------------------------------------- */
/* Replacement helpers                                                        */
/* -------------------------------------------------------------------------- */

/** Drops a single trailing line terminator (`\r\n`, `\n` or `\r`), if present. */
function removeTrailingNewline(text: string): string {
  if (text.endsWith('\r\n')) {
    return text.slice(0, -2);
  }
  if (text.endsWith('\n') || text.endsWith('\r')) {
    return text.slice(0, -1);
  }
  return text;
}

/**
 * Keeps the replacement text symmetrical with the matched slice: when the
 * match dropped the needle's trailing newline, drop it from the replacement
 * too.
 */
function adjustNewStringForTrailingLine(
  newString: string,
  removedTrailingLine: boolean,
): string {
  return removedTrailingLine ? removeTrailingNewline(newString) : newString;
}

export interface NormalizedEditStrings {
  oldString: string;
  newString: string;
}

/**
 * Runs the core normalization pipeline:
 * 1. Strip trailing whitespace copied from numbered output.
 * 2. Attempt to find the literal text inside {@link fileContent}.
 * 3. If found through a relaxed match (smart quotes, line trims, etc.),
 *    return the canonical slice from disk so later replacements operate on
 *    exact bytes.
 *
 * When no match is found (or there is nothing to search), `oldString` is
 * returned unchanged and only `newString` has its trailing whitespace removed.
 */
export function normalizeEditStrings(
  fileContent: string | null,
  oldString: string,
  newString: string,
): NormalizedEditStrings {
  const trimmedNewString = stripTrailingWhitespacePreserveNewlines(newString);

  if (fileContent === null || oldString === '') {
    return {
      oldString,
      newString: trimmedNewString,
    };
  }

  const canonicalOriginal = findMatchedSlice(fileContent, oldString);
  if (canonicalOriginal !== null) {
    return {
      oldString: canonicalOriginal.slice,
      newString: adjustNewStringForTrailingLine(
        trimmedNewString,
        canonicalOriginal.removedTrailingFinalEmptyLine,
      ),
    };
  }

  return {
    oldString,
    newString: trimmedNewString,
  };
}

/**
 * When deleting text and the on-disk content contains the same substring with a
 * trailing newline, automatically consume that newline so the removal does not
 * leave a blank line behind.
 *
 * Both `\n` and `\r\n` terminators are recognized; `\n` is checked first.
 */
export function maybeAugmentOldStringForDeletion(
  fileContent: string | null,
  oldString: string,
  newString: string,
): string {
  if (
    fileContent === null ||
    oldString === '' ||
    newString !== '' ||
    oldString.endsWith('\n')
  ) {
    return oldString;
  }

  for (const terminator of ['\n', '\r\n']) {
    const candidate = `${oldString}${terminator}`;
    if (fileContent.includes(candidate)) {
      return candidate;
    }
  }
  return oldString;
}

/**
 * Counts the number of non-overlapping occurrences of {@link substr} inside
 * {@link source}. Returns 0 when the substring is empty.
 */
export function countOccurrences(source: string, substr: string): number {
  if (substr === '') {
    return 0;
  }

  let count = 0;
  let index = source.indexOf(substr);
  while (index !== -1) {
    count++;
    index = source.indexOf(substr, index + substr.length);
  }
  return count;
}

/**
 * Result from extracting a snippet showing the edited region.
 */
export interface EditSnippetResult {
  /** Starting line number (1-indexed) of the snippet */
  startLine: number;
  /** Ending line number (1-indexed) of the snippet */
  endLine: number;
  /** Total number of lines in the new content */
  totalLines: number;
  /** The snippet content (subset of lines from newContent) */
  content: string;
}

/** Number of unchanged lines shown before and after the changed region. */
const SNIPPET_CONTEXT_LINES = 4;
/** Changed regions spanning more than this many lines produce no snippet. */
const SNIPPET_MAX_LINES = 1000;

/**
 * Extracts a snippet from the edited file showing the changed region with
 * surrounding context. This compares the old and new content line-by-line
 * from both ends to locate the changed region, then widens it by
 * {@link SNIPPET_CONTEXT_LINES} lines on each side.
 *
 * @param oldContent The original file content before the edit (null for new files)
 * @param newContent The new file content after the edit
 * @returns Snippet information, or null if no meaningful snippet can be
 *   extracted (no change, empty new content, or a change larger than
 *   {@link SNIPPET_MAX_LINES})
 */
export function extractEditSnippet(
  oldContent: string | null,
  newContent: string,
): EditSnippetResult | null {
  const newLines = newContent.split('\n');
  const totalLines = newLines.length;

  // New file: the whole content is the snippet.
  if (oldContent === null) {
    return {
      startLine: 1,
      endLine: totalLines,
      totalLines,
      content: newContent,
    };
  }

  // No changes case
  if (oldContent === newContent || !newContent) {
    return null;
  }

  const oldLines = oldContent.split('\n');

  // Find the first line that differs from the start
  let firstDiffLine = 0;
  const minLength = Math.min(oldLines.length, newLines.length);
  while (
    firstDiffLine < minLength &&
    oldLines[firstDiffLine] === newLines[firstDiffLine]
  ) {
    firstDiffLine++;
  }

  // Find the first line that differs from the end
  let oldEndIndex = oldLines.length - 1;
  let newEndIndex = newLines.length - 1;
  while (
    oldEndIndex >= firstDiffLine &&
    newEndIndex >= firstDiffLine &&
    oldLines[oldEndIndex] === newLines[newEndIndex]
  ) {
    oldEndIndex--;
    newEndIndex--;
  }

  // The changed region in the new content is from firstDiffLine to newEndIndex
  // (inclusive). Convert to 1-indexed line numbers.
  const changeStart = firstDiffLine + 1;
  const changeEnd = newEndIndex + 1;

  // If the change region is too large, don't generate a snippet
  if (changeEnd - changeStart > SNIPPET_MAX_LINES) {
    return null;
  }

  // Calculate snippet bounds with context
  const snippetStart = Math.max(1, changeStart - SNIPPET_CONTEXT_LINES);
  const snippetEnd = Math.min(totalLines, changeEnd + SNIPPET_CONTEXT_LINES);

  const snippetLines = newLines.slice(snippetStart - 1, snippetEnd);

  return {
    startLine: snippetStart,
    endLine: snippetEnd,
    totalLines,
    content: snippetLines.join('\n'),
  };
}