Mirror of https://github.com/QwenLM/qwen-code.git (synced 2025-12-19 09:33:53 +00:00)
feat(core, cli): Introduce high-performance FileSearch engine (#5136)
Co-authored-by: Jacob Richman <jacob314@gmail.com>
269  packages/core/src/utils/filesearch/fileSearch.ts  (new file)
@@ -0,0 +1,269 @@
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import path from 'node:path';
import fs from 'node:fs';
import { fdir } from 'fdir';
import picomatch from 'picomatch';
import { Ignore } from './ignore.js';
import { ResultCache } from './result-cache.js';
import * as cache from './crawlCache.js';

export type FileSearchOptions = {
  projectRoot: string;
  ignoreDirs: string[];
  useGitignore: boolean;
  useGeminiignore: boolean;
  cache: boolean;
  cacheTtl: number;
};

export class AbortError extends Error {
  constructor(message = 'Search aborted') {
    super(message);
    this.name = 'AbortError';
  }
}

/**
 * Filters a list of paths based on a given pattern.
 * @param allPaths The list of all paths to filter.
 * @param pattern The picomatch pattern to filter by.
 * @param signal An AbortSignal to cancel the operation.
 * @returns A promise that resolves to the filtered and sorted list of paths.
 */
export async function filter(
  allPaths: string[],
  pattern: string,
  signal: AbortSignal | undefined,
): Promise<string[]> {
  const patternFilter = picomatch(pattern, {
    dot: true,
    contains: true,
    nocase: true,
  });

  const results: string[] = [];
  for (const [i, p] of allPaths.entries()) {
    // Yield control to the event loop periodically to prevent blocking.
    if (i % 1000 === 0) {
      await new Promise((resolve) => setImmediate(resolve));
      if (signal?.aborted) {
        throw new AbortError();
      }
    }

    if (patternFilter(p)) {
      results.push(p);
    }
  }

  results.sort((a, b) => {
    const aIsDir = a.endsWith('/');
    const bIsDir = b.endsWith('/');

    if (aIsDir && !bIsDir) return -1;
    if (!aIsDir && bIsDir) return 1;

    // This is 40% faster than localeCompare, and the only thing we would really
    // gain from localeCompare is a case-sensitive sort.
    return a < b ? -1 : a > b ? 1 : 0;
  });

  return results;
}
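// Illustrative sketch (not part of the original commit) of how `filter` behaves
// with the picomatch options above; the paths are hypothetical and the call is
// assumed to run inside an async context.
//
//   const paths = ['src/', 'src/App.tsx', 'docs/readme.md', '.github/ci.yml'];
//   // `contains: true` matches the pattern anywhere in the path, `nocase: true`
//   // ignores case, and `dot: true` lets patterns reach dotfiles.
//   const hits = await filter(paths, 'src', undefined);
//   // => ['src/', 'src/App.tsx']  (directories sort ahead of files)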

export type SearchOptions = {
  signal?: AbortSignal;
  maxResults?: number;
};

/**
 * Provides a fast and efficient way to search for files within a project,
 * respecting .gitignore and .geminiignore rules, and utilizing caching
 * for improved performance.
 */
export class FileSearch {
  private readonly absoluteDir: string;
  private readonly ignore: Ignore = new Ignore();
  private resultCache: ResultCache | undefined;
  private allFiles: string[] = [];

  /**
   * Constructs a new `FileSearch` instance.
   * @param options Configuration options for the file search.
   */
  constructor(private readonly options: FileSearchOptions) {
    this.absoluteDir = path.resolve(options.projectRoot);
  }

  /**
   * Initializes the file search engine by loading ignore rules, crawling the
   * file system, and building the in-memory cache. This method must be called
   * before performing any searches.
   */
  async initialize(): Promise<void> {
    this.loadIgnoreRules();
    await this.crawlFiles();
    this.buildResultCache();
  }

  /**
   * Searches for files matching a given pattern.
   * @param pattern The picomatch pattern to search for (e.g., '*.js', 'src/**').
   * @param options Search options, including an AbortSignal and maxResults.
   * @returns A promise that resolves to a list of matching file paths, relative
   *   to the project root.
   */
  async search(
    pattern: string,
    options: SearchOptions = {},
  ): Promise<string[]> {
    if (!this.resultCache) {
      throw new Error('Engine not initialized. Call initialize() first.');
    }

    pattern = pattern || '*';

    const { files: candidates, isExactMatch } =
      await this.resultCache!.get(pattern);

    let filteredCandidates;
    if (isExactMatch) {
      filteredCandidates = candidates;
    } else {
      // Apply the user's picomatch pattern filter.
      filteredCandidates = await filter(candidates, pattern, options.signal);
      this.resultCache!.set(pattern, filteredCandidates);
    }

    // Trade-off: We apply a two-stage filtering process.
    // 1. During the file system crawl (`performCrawl`), we only apply directory-level
    //    ignore rules (e.g., `node_modules/`, `dist/`). This is because applying
    //    a full ignore filter (which includes file-specific patterns like `*.log`)
    //    during the crawl can significantly slow down `fdir`.
    // 2. Here, in the `search` method, we apply the full ignore filter
    //    (including file patterns) to the `filteredCandidates` (which have already
    //    been filtered by the user's search pattern and sorted). For autocomplete,
    //    the number of displayed results is small (MAX_SUGGESTIONS_TO_SHOW),
    //    so applying the full filter to this truncated list is much more efficient
    //    than applying it to every file during the initial crawl.
    const fileFilter = this.ignore.getFileFilter();
    const results: string[] = [];
    for (const [i, candidate] of filteredCandidates.entries()) {
      // Yield to the event loop to avoid blocking on large result sets.
      if (i % 1000 === 0) {
        await new Promise((resolve) => setImmediate(resolve));
        if (options.signal?.aborted) {
          throw new AbortError();
        }
      }

      if (results.length >= (options.maxResults ?? Infinity)) {
        break;
      }
      // The `ignore` library throws an error if the path is '.', so we skip it.
      if (candidate === '.') {
        continue;
      }
      if (!fileFilter(candidate)) {
        results.push(candidate);
      }
    }
    return results;
  }
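  // Illustrative sketch (not part of the original commit) of the two-stage
  // filtering described above, assuming hypothetical paths and a `*.log` rule
  // in .gitignore:
  //
  //   // Stage 1 (crawl): only directory rules apply, so `debug.log` survives:
  //   //   allFiles = ['src/', 'src/main.ts', 'debug.log']
  //   // Stage 2 (search): the full ignore filter drops it from the results:
  //   //   await fileSearch.search('*')  // => ['src/', 'src/main.ts']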

  /**
   * Loads ignore rules from .gitignore and .geminiignore files, and applies
   * any additional ignore directories specified in the options.
   */
  private loadIgnoreRules(): void {
    if (this.options.useGitignore) {
      const gitignorePath = path.join(this.absoluteDir, '.gitignore');
      if (fs.existsSync(gitignorePath)) {
        this.ignore.add(fs.readFileSync(gitignorePath, 'utf8'));
      }
    }

    if (this.options.useGeminiignore) {
      const geminiignorePath = path.join(this.absoluteDir, '.geminiignore');
      if (fs.existsSync(geminiignorePath)) {
        this.ignore.add(fs.readFileSync(geminiignorePath, 'utf8'));
      }
    }

    const ignoreDirs = ['.git', ...this.options.ignoreDirs];
    this.ignore.add(
      ignoreDirs.map((dir) => {
        if (dir.endsWith('/')) {
          return dir;
        }
        return `${dir}/`;
      }),
    );
  }
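  // Illustrative sketch (not part of the original commit), with a hypothetical
  // option value:
  //
  //   // options.ignoreDirs = ['dist', 'build/']
  //   // => this.ignore.add(['.git/', 'dist/', 'build/'])
  //   //    (a trailing slash is appended only when it is missing)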

  /**
   * Crawls the file system to get a list of all files and directories,
   * optionally using a cache for faster initialization.
   */
  private async crawlFiles(): Promise<void> {
    if (this.options.cache) {
      const cacheKey = cache.getCacheKey(
        this.absoluteDir,
        this.ignore.getFingerprint(),
      );
      const cachedResults = cache.read(cacheKey);

      if (cachedResults) {
        this.allFiles = cachedResults;
        return;
      }
    }

    this.allFiles = await this.performCrawl();

    if (this.options.cache) {
      const cacheKey = cache.getCacheKey(
        this.absoluteDir,
        this.ignore.getFingerprint(),
      );
      cache.write(cacheKey, this.allFiles, this.options.cacheTtl * 1000);
    }
  }
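  // The `crawlCache` module is not shown in this commit hunk; the shape below is
  // inferred from the calls above and may differ from the real implementation:
  //
  //   // getCacheKey(absoluteDir: string, ignoreFingerprint: string): string
  //   // read(key: string): string[] | undefined
  //   // write(key: string, files: string[], ttlMs: number): void
  //
  // Note that `cacheTtl` is configured in seconds and converted to ms above.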

  /**
   * Performs the actual file system crawl using `fdir`, applying directory
   * ignore rules.
   * @returns A promise that resolves to a list of all files and directories.
   */
  private async performCrawl(): Promise<string[]> {
    const dirFilter = this.ignore.getDirectoryFilter();

    // We use `fdir` for fast file system traversal. A key performance
    // optimization for large workspaces is to exclude entire directories
    // early in the traversal process. This is why we apply directory-specific
    // ignore rules (e.g., `node_modules/`, `dist/`) directly to `fdir`'s
    // exclude filter.
    const api = new fdir()
      .withRelativePaths()
      .withDirs()
      .withPathSeparator('/') // Always use unix style paths.
      .exclude((_, dirPath) => {
        const relativePath = path.relative(this.absoluteDir, dirPath);
        return dirFilter(`${relativePath}/`);
      });

    return api.crawl(this.absoluteDir).withPromise();
  }

  /**
   * Builds the in-memory cache for fast pattern matching.
   */
  private buildResultCache(): void {
    this.resultCache = new ResultCache(this.allFiles, this.absoluteDir);
  }
}
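// Illustrative usage sketch (not part of the original commit); the option values
// and project path are hypothetical:
//
//   const fileSearch = new FileSearch({
//     projectRoot: '/path/to/project',
//     ignoreDirs: [],
//     useGitignore: true,
//     useGeminiignore: true,
//     cache: true,
//     cacheTtl: 30, // seconds; converted to milliseconds internally
//   });
//   await fileSearch.initialize();
//   const matches = await fileSearch.search('src/**', { maxResults: 20 });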