From f64bf8dede6a2fa4dd167fc2c33fcd75c6e5f519 Mon Sep 17 00:00:00 2001 From: Yiheng Xu Date: Fri, 5 Sep 2025 16:24:09 +0800 Subject: [PATCH] fix lint --- .github/workflows/terminal-bench.yml | 2 +- .../swe-bench-astropy-1/docker-compose.yaml | 2 +- .../terminal-bench/terminal-bench.test.ts | 324 ++++++++++-------- 3 files changed, 175 insertions(+), 153 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index c1e9eb61..ca815a84 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -9,7 +9,7 @@ on: type: 'string' default: 'latest' release: - types: [published] + types: ['published'] jobs: terminal-bench: diff --git a/integration-tests/terminal-bench/ci-tasks/swe-bench-astropy-1/docker-compose.yaml b/integration-tests/terminal-bench/ci-tasks/swe-bench-astropy-1/docker-compose.yaml index e0ef70ae..3b1c0cc7 100644 --- a/integration-tests/terminal-bench/ci-tasks/swe-bench-astropy-1/docker-compose.yaml +++ b/integration-tests/terminal-bench/ci-tasks/swe-bench-astropy-1/docker-compose.yaml @@ -4,7 +4,7 @@ services: dockerfile: Dockerfile image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME} container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME} - command: [ "sh", "-c", "sleep infinity" ] + command: ['sh', '-c', 'sleep infinity'] environment: - TEST_DIR=${T_BENCH_TEST_DIR} volumes: diff --git a/integration-tests/terminal-bench/terminal-bench.test.ts b/integration-tests/terminal-bench/terminal-bench.test.ts index 0b25653f..ed5348a9 100644 --- a/integration-tests/terminal-bench/terminal-bench.test.ts +++ b/integration-tests/terminal-bench/terminal-bench.test.ts @@ -110,40 +110,155 @@ describe('terminal-bench integration', () => { } describe.each(testTasks)('Task: %s', (taskId) => { - it(`should complete ${taskId} task with oracle agent`, async () => { - rig.setup(`terminal-bench-oracle-${taskId}`); + it( + `should complete ${taskId} task with oracle agent`, + async () => { + rig.setup(`terminal-bench-oracle-${taskId}`); - const outputPath = join(outputBase, `oracle-${taskId}`); + const outputPath = join(outputBase, `oracle-${taskId}`); - // Check if ci-tasks exists - if (!existsSync(ciTasksPath)) { - console.error(`CI tasks directory does not exist: ${ciTasksPath}`); - throw new Error( - 'CI tasks not found. Please ensure ci-tasks directory is present.', - ); - } + // Check if ci-tasks exists + if (!existsSync(ciTasksPath)) { + console.error(`CI tasks directory does not exist: ${ciTasksPath}`); + throw new Error( + 'CI tasks not found. Please ensure ci-tasks directory is present.', + ); + } - // Run oracle agent on the task using ci-tasks dataset (non-blocking) - const args = [ - 'run', - '--agent', - 'oracle', - '--dataset-path', - ciTasksPath, - '--task-id', - taskId, - '--output-path', - outputPath, - '--n-concurrent', - '1', - ]; + // Run oracle agent on the task using ci-tasks dataset (non-blocking) + const args = [ + 'run', + '--agent', + 'oracle', + '--dataset-path', + ciTasksPath, + '--task-id', + taskId, + '--output-path', + outputPath, + '--n-concurrent', + '1', + ]; - try { + try { + const result = await new Promise((resolve, reject) => { + let stdout = ''; + let stderr = ''; + + const child = spawn('tb', args, { env: { ...process.env } }); + + child.stdout?.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr?.on('data', (data) => { + stderr += data.toString(); + }); + + const to = setTimeout( + () => { + child.kill(); + reject(new Error(`Process timeout for ${taskId}`)); + }, + Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000), + ); // Leave 1 minute buffer + + child.on('close', (code) => { + clearTimeout(to); + if (code !== 0) { + console.error( + `oracle agent failed for ${taskId} with stderr:`, + stderr, + ); + reject( + new Error(`Process exited with code ${code}: ${stderr}`), + ); + } else { + resolve(stdout); + } + }); + + child.on('error', (error) => { + clearTimeout(to); + console.error('Failed to start process:', error); + reject(error); + }); + }); + + // Check if the run succeeded + expect(result).toContain('Results Summary'); + + // Check if results file was created + // Terminal-bench creates results in a timestamped subdirectory + const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' }) + .trim() + .split('\n'); + const latestDir = dirs[dirs.length - 1]; // Get the latest run directory + const resultsFile = join(latestDir, 'results.json'); + + expect(existsSync(resultsFile)).toBe(true); + + const results = JSON.parse(readFileSync(resultsFile, 'utf-8')); + expect(results.accuracy).toBe(1.0); // Oracle should always succeed + expect(results.n_resolved).toBe(1); + expect(results.n_unresolved).toBe(0); + } catch (error) { + console.error(`Oracle agent failed for ${taskId}:`, error); + throw error; + } + }, + DEFAULT_TIMEOUT_MS, + ); + + it( + `should complete ${taskId} task with qwen-code agent`, + async () => { + rig.setup(`terminal-bench-qwen-${taskId}`); + + const outputPath = join(outputBase, `qwen-${taskId}`); + + // Check if API key is available + const apiKey = process.env['OPENAI_API_KEY']; + if (!apiKey) { + throw new Error( + 'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.', + ); + } + + // Run qwen-code agent using spawn to avoid blocking event loop + const args = [ + 'run', + '--agent-import-path', + 'integration-tests.terminal-bench.qwen_code:QwenCodeAgent', + '--agent-kwarg', + `api_key=${apiKey}`, + '--agent-kwarg', + `version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`, + '--dataset-path', + ciTasksPath, + '--task-id', + taskId, + '--output-path', + outputPath, + '--n-concurrent', + '1', + ]; + + const env = { + ...process.env, + OPENAI_API_KEY: apiKey, + OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus', + OPENAI_BASE_URL: + process.env['OPENAI_BASE_URL'] || + 'https://dashscope.aliyuncs.com/compatible-mode/v1', + }; + + // Use spawn with promise to avoid blocking const result = await new Promise((resolve, reject) => { let stdout = ''; let stderr = ''; - const child = spawn('tb', args, { env: { ...process.env } }); + const child = spawn('tb', args, { env }); child.stdout?.on('data', (data) => { stdout += data.toString(); @@ -153,15 +268,12 @@ describe('terminal-bench integration', () => { stderr += data.toString(); }); - const to = setTimeout(() => { - child.kill(); - reject(new Error(`Process timeout for ${taskId}`)); - }, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer - child.on('close', (code) => { - clearTimeout(to); if (code !== 0) { - console.error(`oracle agent failed for ${taskId} with stderr:`, stderr); + console.error( + `qwen-code agent failed for ${taskId} with stderr:`, + stderr, + ); reject(new Error(`Process exited with code ${code}: ${stderr}`)); } else { resolve(stdout); @@ -169,138 +281,48 @@ describe('terminal-bench integration', () => { }); child.on('error', (error) => { - clearTimeout(to); console.error('Failed to start process:', error); reject(error); }); + + // Set timeout based on task + setTimeout( + () => { + child.kill(); + reject(new Error(`Process timeout for ${taskId}`)); + }, + Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000), + ); // Leave 1 minute buffer + }).catch((error) => { + // This is expected if API key is not configured correctly + if (error instanceof Error && error.message?.includes('API')) { + console.warn('API configuration issue - skipping test'); + return ''; + } + throw error; }); - // Check if the run succeeded + if (!result) return; // Skip if API configuration issue + + // Check if the run completed expect(result).toContain('Results Summary'); - // Check if results file was created - // Terminal-bench creates results in a timestamped subdirectory + // Check results file in timestamped subdirectory const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' }) .trim() .split('\n'); - const latestDir = dirs[dirs.length - 1]; // Get the latest run directory + const latestDir = dirs[dirs.length - 1]; const resultsFile = join(latestDir, 'results.json'); expect(existsSync(resultsFile)).toBe(true); const results = JSON.parse(readFileSync(resultsFile, 'utf-8')); - expect(results.accuracy).toBe(1.0); // Oracle should always succeed - expect(results.n_resolved).toBe(1); - expect(results.n_unresolved).toBe(0); - } catch (error) { - console.error(`Oracle agent failed for ${taskId}:`, error); - throw error; - } - }, DEFAULT_TIMEOUT_MS); - - it(`should complete ${taskId} task with qwen-code agent`, async () => { - rig.setup(`terminal-bench-qwen-${taskId}`); - - const outputPath = join(outputBase, `qwen-${taskId}`); - - // Check if API key is available - const apiKey = process.env['OPENAI_API_KEY']; - if (!apiKey) { - throw new Error( - 'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.', - ); - } - - // Run qwen-code agent using spawn to avoid blocking event loop - const args = [ - 'run', - '--agent-import-path', - 'integration-tests.terminal-bench.qwen_code:QwenCodeAgent', - '--agent-kwarg', - `api_key=${apiKey}`, - '--agent-kwarg', - `version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`, - '--dataset-path', - ciTasksPath, - '--task-id', - taskId, - '--output-path', - outputPath, - '--n-concurrent', - '1', - ]; - - const env = { - ...process.env, - OPENAI_API_KEY: apiKey, - OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus', - OPENAI_BASE_URL: - process.env['OPENAI_BASE_URL'] || - 'https://dashscope.aliyuncs.com/compatible-mode/v1', - }; - - // Use spawn with promise to avoid blocking - const result = await new Promise((resolve, reject) => { - let stdout = ''; - let stderr = ''; - - const child = spawn('tb', args, { env }); - - child.stdout?.on('data', (data) => { - stdout += data.toString(); - }); - - child.stderr?.on('data', (data) => { - stderr += data.toString(); - }); - - child.on('close', (code) => { - if (code !== 0) { - console.error(`qwen-code agent failed for ${taskId} with stderr:`, stderr); - reject(new Error(`Process exited with code ${code}: ${stderr}`)); - } else { - resolve(stdout); - } - }); - - child.on('error', (error) => { - console.error('Failed to start process:', error); - reject(error); - }); - - // Set timeout based on task - setTimeout(() => { - child.kill(); - reject(new Error(`Process timeout for ${taskId}`)); - }, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer - }).catch((error) => { - // This is expected if API key is not configured correctly - if (error instanceof Error && error.message?.includes('API')) { - console.warn('API configuration issue - skipping test'); - return ''; - } - throw error; - }); - - if (!result) return; // Skip if API configuration issue - - // Check if the run completed - expect(result).toContain('Results Summary'); - - // Check results file in timestamped subdirectory - const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' }) - .trim() - .split('\n'); - const latestDir = dirs[dirs.length - 1]; - const resultsFile = join(latestDir, 'results.json'); - - expect(existsSync(resultsFile)).toBe(true); - - const results = JSON.parse(readFileSync(resultsFile, 'utf-8')); - // Check that the task actually completed successfully - expect(results).toHaveProperty('accuracy'); - expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved - expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0 - }, DEFAULT_TIMEOUT_MS); + // Check that the task actually completed successfully + expect(results).toHaveProperty('accuracy'); + expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved + expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0 + }, + DEFAULT_TIMEOUT_MS, + ); }); });