mirror of
https://github.com/QwenLM/qwen-code.git
synced 2025-12-20 08:47:44 +00:00
500 lines
13 KiB
TypeScript
500 lines
13 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2025 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
/**
|
|
* Helpers for reconciling LLM-proposed edits with on-disk text.
|
|
*
|
|
* The normalization pipeline intentionally stays deterministic: we first try
|
|
* literal substring matches, then gradually relax comparison rules (smart
|
|
* quotes, em-dashes, trailing whitespace, etc.) until we either locate the
|
|
* exact slice from the file or conclude the edit cannot be applied.
|
|
*/
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
/* Character-level normalization */
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
/**
 * Lookup table from typographic code points to the plain ASCII character
 * they are treated as equivalent to when comparing text. Every key and every
 * value is a single UTF-16 code unit, so substituting through this table never
 * changes the length of a string.
 */
const UNICODE_EQUIVALENT_MAP: Record<string, string> = {
  // Dashes and hyphen look-alikes collapse to the ASCII hyphen-minus.
  '\u2010': '-',
  '\u2011': '-',
  '\u2012': '-',
  '\u2013': '-',
  '\u2014': '-',
  '\u2015': '-',
  '\u2212': '-',
  // Typographic single quotes collapse to the straight apostrophe.
  '\u2018': "'",
  '\u2019': "'",
  '\u201A': "'",
  '\u201B': "'",
  // Typographic double quotes collapse to the straight double quote.
  '\u201C': '"',
  '\u201D': '"',
  '\u201E': '"',
  '\u201F': '"',
  // Non-breaking and fixed-width spaces collapse to a regular space.
  '\u00A0': ' ',
  '\u2002': ' ',
  '\u2003': ' ',
  '\u2004': ' ',
  '\u2005': ' ',
  '\u2006': ' ',
  '\u2007': ' ',
  '\u2008': ' ',
  '\u2009': ' ',
  '\u200A': ' ',
  '\u202F': ' ',
  '\u205F': ' ',
  '\u3000': ' ',
};

/**
 * Replaces every character listed in {@link UNICODE_EQUIVALENT_MAP} with its
 * ASCII counterpart and leaves all other characters untouched.
 *
 * @param text Text to normalize.
 * @returns A string of the same length with typographic variants replaced.
 */
function normalizeBasicCharacters(text: string): string {
  // Array.from walks the string by code point, exactly like for...of.
  return Array.from(
    text,
    (symbol) => UNICODE_EQUIVALENT_MAP[symbol] ?? symbol,
  ).join('');
}
|
|
|
|
/**
 * Trims the whitespace at the end of every line without touching the line
 * terminators themselves, so `\r\n`, `\n` and `\r` all survive unchanged.
 *
 * @param text Text whose lines should lose their trailing whitespace.
 * @returns The text with each line right-trimmed.
 */
function stripTrailingWhitespacePreserveNewlines(text: string): string {
  // Splitting on a capturing group keeps the separators in the output array:
  // even positions hold line content, odd positions hold the terminators.
  return text
    .split(/(\r\n|\n|\r)/)
    .map((part, position) => (position % 2 === 0 ? part.trimEnd() : part))
    .join('');
}
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
/* Line-based search helpers */
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
/** Outcome of locating a needle inside a haystack. */
interface MatchedSliceResult {
  /** The text cut out of the haystack that corresponds to the needle. */
  slice: string;
  /**
   * True when the match only succeeded after the needle's trailing empty line
   * was dropped.
   */
  removedTrailingFinalEmptyLine: boolean;
}
|
|
|
|
/**
 * Per-line transforms tried in order, from strictest to most lenient: first
 * the line as-is, then the line with trailing whitespace removed. Neither
 * pass touches leading whitespace, so indentation always has to agree and a
 * match cannot land at the wrong scope level.
 */
const LINE_COMPARISON_PASSES: Array<(value: string) => string> = [
  function compareVerbatim(line) {
    return line;
  },
  function compareIgnoringTrailingWhitespace(line) {
    return line.trimEnd();
  },
];
|
|
|
|
/**
 * Most lenient per-line transform: replaces typographic characters via
 * `normalizeBasicCharacters` and then drops trailing whitespace.
 *
 * @param value A single line of text.
 * @returns The line in the form used for the final comparison pass.
 */
function normalizeLineForComparison(value: string): string {
  const asciiEquivalent = normalizeBasicCharacters(value);
  return asciiEquivalent.trimEnd();
}
|
|
|
|
/**
 * Finds the first index at which {@link pattern} occurs as a contiguous run
 * inside {@link lines}, comparing both sides after applying
 * {@link transform}.
 *
 * The pattern is transformed once up front and each haystack line is
 * transformed at most once (lazily), so the transform runs no more than
 * `lines.length + pattern.length` times regardless of how many candidate
 * positions are inspected.
 *
 * @param lines Haystack lines.
 * @param pattern Lines that must appear consecutively in the haystack.
 * @param transform Normalization applied to every line before comparing.
 * @returns The zero-based start index of the first match, `0` for an empty
 *   pattern, or `null` when there is no match.
 */
function seekSequenceWithTransform(
  lines: string[],
  pattern: string[],
  transform: (value: string) => string,
): number | null {
  if (pattern.length === 0) {
    return 0;
  }

  if (pattern.length > lines.length) {
    return null;
  }

  // The pattern side is loop-invariant: transform it exactly once.
  const transformedPattern = pattern.map((line) => transform(line));

  // Haystack lines are transformed on first use and then reused by every
  // overlapping candidate window.
  const lineCache = new Array<string | undefined>(lines.length);
  const transformedLine = (index: number): string => {
    let cached = lineCache[index];
    if (cached === undefined) {
      cached = transform(lines[index]);
      lineCache[index] = cached;
    }
    return cached;
  };

  const lastStart = lines.length - pattern.length;
  outer: for (let start = 0; start <= lastStart; start++) {
    for (let p = 0; p < transformedPattern.length; p++) {
      if (transformedLine(start + p) !== transformedPattern[p]) {
        continue outer;
      }
    }
    return start;
  }

  return null;
}
|
|
|
|
/**
 * Splits {@link text} on `\n` and records where each line begins.
 *
 * @param text Text to index.
 * @returns `lines` holds the split lines (without their `\n`); `offsets[i]`
 *   is the character index at which line `i` starts, and the extra final
 *   entry `offsets[lines.length]` equals `text.length`.
 */
function buildLineIndex(text: string): {
  lines: string[];
  offsets: number[];
} {
  const lines = text.split('\n');
  const offsets: number[] = [];

  let position = 0;
  for (const line of lines) {
    offsets.push(position);
    // Skip over the line plus the '\n' that split() consumed.
    position += line.length + 1;
  }
  // Sentinel entry marking the end of the text.
  offsets.push(text.length);

  return { lines, offsets };
}
|
|
|
|
/**
 * Cuts the original characters that span a run of lines out of {@link text}.
 *
 * @param text The full text the index was built from.
 * @param offsets Start offset of every line, as produced by `buildLineIndex`.
 * @param lines The lines of {@link text}, as produced by `buildLineIndex`.
 * @param startLine Zero-based index of the first line to include.
 * @param lineCount Number of lines to include.
 * @param includeTrailingNewline Whether the slice should extend up to the
 *   start of the line that follows the run.
 * @returns The matching substring of {@link text}; for a `lineCount` of zero,
 *   `'\n'` or `''` depending on {@link includeTrailingNewline}.
 */
function sliceFromLines(
  text: string,
  offsets: number[],
  lines: string[],
  startLine: number,
  lineCount: number,
  includeTrailingNewline: boolean,
): string {
  if (lineCount === 0) {
    if (includeTrailingNewline) {
      return '\n';
    }
    return '';
  }

  const finalLine = startLine + lineCount - 1;
  const from = offsets[startLine] ?? 0;
  const endOfFinalLine =
    (offsets[finalLine] ?? 0) + (lines[finalLine]?.length ?? 0);

  if (!includeTrailingNewline) {
    return text.slice(from, endOfFinalLine);
  }

  // Prefer the recorded start of the following line; it sits just past the
  // newline that terminates the run.
  const followingLineStart = offsets[finalLine + 1];
  if (followingLineStart !== undefined) {
    return text.slice(from, followingLineStart);
  }

  // No recorded offset: fall back to the end of the text when it ends in '\n'.
  return text.slice(from, text.endsWith('\n') ? text.length : endOfFinalLine);
}
|
|
|
|
/**
 * Locates {@link needle} inside {@link haystack} by comparing whole lines
 * under progressively more lenient rules: each entry of
 * `LINE_COMPARISON_PASSES` in order, then `normalizeLineForComparison`.
 *
 * A needle that ends with `\n` splits into its real lines plus one trailing
 * empty string. The full sequence (including that empty entry) is tried first;
 * if that fails, the sequence without it is tried and the result is flagged
 * with `removedTrailingFinalEmptyLine`.
 *
 * @param haystack Text to search in.
 * @param needle Text to search for.
 * @returns The exact haystack slice covering the matched lines, or `null` when
 *   no pass produces a match.
 */
function findLineBasedMatch(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  const { lines, offsets } = buildLineIndex(haystack);
  // split() always yields at least one element, so no emptiness check is
  // needed here.
  const patternLines = needle.split('\n');
  const endsWithNewline = needle.endsWith('\n');

  const attemptMatch = (candidate: string[]): number | null => {
    for (const pass of LINE_COMPARISON_PASSES) {
      const idx = seekSequenceWithTransform(lines, candidate, pass);
      if (idx !== null) {
        return idx;
      }
    }
    return seekSequenceWithTransform(
      lines,
      candidate,
      normalizeLineForComparison,
    );
  };

  let matchIndex = attemptMatch(patternLines);
  if (matchIndex !== null) {
    // The trailing '' produced by a final '\n' stands for "end of the needle",
    // not for a blank line the needle owns. Slicing it as a real line would
    // also consume the blank (or whitespace-only) haystack line it matched and
    // that line's newline, so only the real lines plus the single newline
    // after them are returned.
    const realLineCount = endsWithNewline
      ? patternLines.length - 1
      : patternLines.length;
    return {
      slice: sliceFromLines(
        haystack,
        offsets,
        lines,
        matchIndex,
        realLineCount,
        endsWithNewline,
      ),
      removedTrailingFinalEmptyLine: false,
    };
  }

  // Retry without the trailing empty entry ('' is last for a needle ending in
  // '\n' and for the empty needle).
  if (patternLines[patternLines.length - 1] === '') {
    const trimmedPattern = patternLines.slice(0, -1);
    if (trimmedPattern.length === 0) {
      return null;
    }
    matchIndex = attemptMatch(trimmedPattern);
    if (matchIndex !== null) {
      return {
        slice: sliceFromLines(
          haystack,
          offsets,
          lines,
          matchIndex,
          trimmedPattern.length,
          false,
        ),
        removedTrailingFinalEmptyLine: true,
      };
    }
  }

  return null;
}
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
/* Slice discovery */
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
/**
 * Returns the literal slice of {@link haystack} that corresponds to
 * {@link needle}, trying three strategies in order: an exact substring
 * search, a substring search after `normalizeBasicCharacters` has been applied
 * to both sides, and finally `findLineBasedMatch`.
 *
 * @param haystack Text to search in.
 * @param needle Text to search for.
 * @returns The matched slice, or `null` when the needle is empty or nothing
 *   matches.
 */
function findMatchedSlice(
  haystack: string,
  needle: string,
): MatchedSliceResult | null {
  if (needle.length === 0) {
    return null;
  }

  // Both substring strategies yield a start index into the haystack; the
  // slice always spans needle.length characters from there.
  const sliceAt = (start: number): MatchedSliceResult => ({
    slice: haystack.slice(start, start + needle.length),
    removedTrailingFinalEmptyLine: false,
  });

  const exactStart = haystack.indexOf(needle);
  if (exactStart !== -1) {
    return sliceAt(exactStart);
  }

  const relaxedStart = normalizeBasicCharacters(haystack).indexOf(
    normalizeBasicCharacters(needle),
  );
  if (relaxedStart !== -1) {
    return sliceAt(relaxedStart);
  }

  return findLineBasedMatch(haystack, needle);
}
|
|
|
|
/*
 * Note: `findMatchedSlice` (defined above) returns the literal slice from
 * `haystack` that best corresponds to the provided `needle`, or `null` when
 * no match is found.
 */
|
|
/* -------------------------------------------------------------------------- */
|
|
/* Replacement helpers */
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
/**
 * Drops a single line terminator (`\r\n`, `\n` or `\r`) from the end of
 * {@link text}, if there is one.
 *
 * @param text Text that may end with a line terminator.
 * @returns The text without its final terminator; unchanged when it has none.
 */
function removeTrailingNewline(text: string): string {
  // '\r\n' is listed first so a CRLF pair is removed as a unit.
  for (const terminator of ['\r\n', '\n', '\r']) {
    if (text.endsWith(terminator)) {
      return text.slice(0, -terminator.length);
    }
  }
  return text;
}
|
|
|
|
/**
 * Removes the final line terminator from {@link newString} when
 * {@link removedTrailingLine} is set, and returns it untouched otherwise.
 *
 * @param newString Replacement text.
 * @param removedTrailingLine Whether the final terminator should be removed.
 * @returns The adjusted replacement text.
 */
function adjustNewStringForTrailingLine(
  newString: string,
  removedTrailingLine: boolean,
): string {
  if (!removedTrailingLine) {
    return newString;
  }
  return removeTrailingNewline(newString);
}
|
|
|
|
/** Pair of edit strings produced by `normalizeEditStrings`. */
export interface NormalizedEditStrings {
  /** The text to be replaced. */
  oldString: string;
  /** The text to put in its place. */
  newString: string;
}
|
|
|
|
/**
 * Core normalization pipeline for an edit:
 *   1. Right-trim every line of {@link newString}.
 *   2. Look for {@link oldString} inside {@link fileContent}.
 *   3. When it is found, possibly through a relaxed comparison (smart quotes,
 *      line trims, etc.), swap {@link oldString} for the exact slice read from
 *      the file so later replacements operate on the real characters.
 *
 * @param fileContent Current file text, or `null` when there is none.
 * @param oldString Text the edit wants to replace.
 * @param newString Text the edit wants to insert.
 * @returns The strings to use for the replacement. `oldString` is returned
 *   unchanged when there is no file content, when it is empty, or when no
 *   match is found.
 */
export function normalizeEditStrings(
  fileContent: string | null,
  oldString: string,
  newString: string,
): NormalizedEditStrings {
  const cleanedNewString = stripTrailingWhitespacePreserveNewlines(newString);
  const passthrough: NormalizedEditStrings = {
    oldString,
    newString: cleanedNewString,
  };

  if (fileContent === null || oldString === '') {
    return passthrough;
  }

  const match = findMatchedSlice(fileContent, oldString);
  if (match === null) {
    return passthrough;
  }

  return {
    oldString: match.slice,
    newString: adjustNewStringForTrailingLine(
      cleanedNewString,
      match.removedTrailingFinalEmptyLine,
    ),
  };
}
|
|
|
|
/**
 * When deleting text and the on-disk content contains the same substring
 * followed by a line terminator, automatically consume that terminator so the
 * removal does not leave a blank line behind.
 *
 * Both `\n` and `\r\n` are recognized; `\n` is tried first.
 *
 * @param fileContent Current file text, or `null` when there is none.
 * @param oldString Text that is about to be removed.
 * @param newString Replacement text; augmentation only applies when it is
 *   empty, i.e. for a pure deletion.
 * @returns {@link oldString} extended by the terminator that follows it in the
 *   file, or {@link oldString} unchanged when this is not a deletion, when it
 *   already ends with `\n`, or when no such occurrence exists.
 */
export function maybeAugmentOldStringForDeletion(
  fileContent: string | null,
  oldString: string,
  newString: string,
): string {
  if (
    fileContent === null ||
    oldString === '' ||
    newString !== '' ||
    oldString.endsWith('\n')
  ) {
    return oldString;
  }

  // CRLF content would otherwise keep its '\r\n' and leave an empty line.
  for (const terminator of ['\n', '\r\n']) {
    const candidate = `${oldString}${terminator}`;
    if (fileContent.includes(candidate)) {
      return candidate;
    }
  }

  return oldString;
}
|
|
|
|
/**
 * Counts how many times {@link substr} appears in {@link source}, scanning
 * left to right and never counting overlapping matches.
 *
 * @param source Text to scan.
 * @param substr Text to look for.
 * @returns The number of non-overlapping matches; `0` for an empty
 *   {@link substr}.
 */
export function countOccurrences(source: string, substr: string): number {
  if (substr.length === 0) {
    return 0;
  }

  let total = 0;
  for (
    let at = source.indexOf(substr);
    at !== -1;
    // Resume after the current match so matches cannot overlap.
    at = source.indexOf(substr, at + substr.length)
  ) {
    total += 1;
  }
  return total;
}
|
|
|
|
/**
 * Result from extracting a snippet showing the edited region.
 */
export interface EditSnippetResult {
  /** Starting line number (1-indexed) of the snippet */
  startLine: number;
  /** Ending line number (1-indexed) of the snippet */
  endLine: number;
  /** Total number of lines in the new content */
  totalLines: number;
  /** The snippet content (subset of lines from newContent) */
  content: string;
}

/** Default number of unchanged lines shown on each side of the change. */
const SNIPPET_CONTEXT_LINES = 4;
/** Changed regions spanning more than this are not turned into a snippet. */
const SNIPPET_MAX_LINES = 1000;

/**
 * Extracts a snippet from the edited file showing the changed region with
 * surrounding context. The old and new content are compared line-by-line
 * from both ends to locate the changed region.
 *
 * @param oldContent The original file content before the edit (null for new
 *   files, in which case the whole new content is returned)
 * @param newContent The new file content after the edit
 * @param contextLines Number of context lines to show before and after the
 *   change. Defaults to `SNIPPET_CONTEXT_LINES`; negative values are treated
 *   as 0, fractional values are rounded down, and non-finite values fall back
 *   to the default.
 * @returns Snippet information, or null when the content is unchanged, the
 *   new content is empty, or the changed region exceeds `SNIPPET_MAX_LINES`
 */
export function extractEditSnippet(
  oldContent: string | null,
  newContent: string,
  contextLines = SNIPPET_CONTEXT_LINES,
): EditSnippetResult | null {
  const newLines = newContent.split('\n');
  const totalLines = newLines.length;

  if (oldContent === null) {
    return {
      startLine: 1,
      endLine: totalLines,
      totalLines,
      content: newContent,
    };
  }

  // No changes case
  if (oldContent === newContent || !newContent) {
    return null;
  }

  const context = Number.isFinite(contextLines)
    ? Math.max(0, Math.floor(contextLines))
    : SNIPPET_CONTEXT_LINES;

  const oldLines = oldContent.split('\n');

  // Find the first line that differs from the start
  let firstDiffLine = 0;
  const minLength = Math.min(oldLines.length, newLines.length);

  while (firstDiffLine < minLength) {
    if (oldLines[firstDiffLine] !== newLines[firstDiffLine]) {
      break;
    }
    firstDiffLine++;
  }

  // Find the first line that differs from the end, never walking back past
  // the common prefix found above.
  let oldEndIndex = oldLines.length - 1;
  let newEndIndex = newLines.length - 1;

  while (oldEndIndex >= firstDiffLine && newEndIndex >= firstDiffLine) {
    if (oldLines[oldEndIndex] !== newLines[newEndIndex]) {
      break;
    }
    oldEndIndex--;
    newEndIndex--;
  }

  // The changed region in the new content is from firstDiffLine to
  // newEndIndex (inclusive). Convert to 1-indexed line numbers. For a pure
  // deletion changeEnd ends up one less than changeStart.
  const changeStart = firstDiffLine + 1;
  const changeEnd = newEndIndex + 1;

  // If the change region is too large, don't generate a snippet
  if (changeEnd - changeStart > SNIPPET_MAX_LINES) {
    return null;
  }

  // Calculate snippet bounds with context, clamped to the file
  const snippetStart = Math.max(1, changeStart - context);
  const snippetEnd = Math.min(totalLines, changeEnd + context);

  const snippetLines = newLines.slice(snippetStart - 1, snippetEnd);

  return {
    startLine: snippetStart,
    endLine: snippetEnd,
    totalLines,
    content: snippetLines.join('\n'),
  };
}
|