From c00287ac459dec6ab920717fae00ade5741d4c83 Mon Sep 17 00:00:00 2001 From: Yiheng Xu Date: Fri, 5 Sep 2025 16:17:46 +0800 Subject: [PATCH] handle timeout --- .github/workflows/terminal-bench.yml | 19 +++-- .../terminal-bench/terminal-bench.test.ts | 85 ++++++++++++++----- integration-tests/vitest.config.ts | 5 +- 3 files changed, 77 insertions(+), 32 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 50fb62e8..c1e9eb61 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -1,16 +1,15 @@ name: 'Terminal Bench Tests' on: - push: - branches: - - 'main' - - 'feat/**' - - 'feature/**' - pull_request: - branches: - - 'main' - merge_group: workflow_dispatch: + inputs: + version: + description: 'The version to test.' + required: true + type: 'string' + default: 'latest' + release: + types: [published] jobs: terminal-bench: @@ -64,6 +63,7 @@ jobs: VERBOSE: 'true' KEEP_OUTPUT: 'true' TB_TASK_ID: '${{ matrix.task_id }}' + TB_TIMEOUT_MINUTES: '30' - name: 'Run Terminal Bench Qwen (task: ${{ matrix.task_id }})' run: 'npm run test:terminal-bench:qwen' @@ -77,6 +77,7 @@ jobs: VERBOSE: 'true' KEEP_OUTPUT: 'true' TB_TASK_ID: '${{ matrix.task_id }}' + TB_TIMEOUT_MINUTES: '30' - name: 'Upload test artifacts' if: 'always()' diff --git a/integration-tests/terminal-bench/terminal-bench.test.ts b/integration-tests/terminal-bench/terminal-bench.test.ts index eee2eb1e..0b25653f 100644 --- a/integration-tests/terminal-bench/terminal-bench.test.ts +++ b/integration-tests/terminal-bench/terminal-bench.test.ts @@ -20,6 +20,12 @@ describe('terminal-bench integration', () => { // Use local ci-tasks directory for self-contained tests const ciTasksPath = join(__dirname, 'ci-tasks'); + // Single timeout source (minutes), defaults to workflow's 30 minutes + const DEFAULT_TIMEOUT_MINUTES = Number( + process.env['TB_TIMEOUT_MINUTES'] || '30', + ); + const DEFAULT_TIMEOUT_MS = DEFAULT_TIMEOUT_MINUTES * 60 * 1000; + // Use the integration test directory set by globalSetup.ts if available, // otherwise create our own in .integration-tests const integrationTestsDir = join(rootDir, '.integration-tests'); @@ -54,9 +60,7 @@ describe('terminal-bench integration', () => { console.log('Installing terminal-bench...'); // Use uv tool install for terminal-bench try { - execSync('uv tool install --python 3.12 terminal-bench', { - stdio: 'inherit', - }); + execSync('uv tool install --python 3.12 terminal-bench'); // Add uv tools to PATH for this process process.env['PATH'] = `${process.env['HOME']}/.local/bin:${process.env['PATH']}`; @@ -77,23 +81,20 @@ describe('terminal-bench integration', () => { }); // Test configuration for different tasks - const baseTestTasks = [ - { id: 'hello-world', timeout: 300000 }, - { id: 'swe-bench-astropy-1', timeout: 600000 }, - ] as const; + const baseTestTasks = ['hello-world', 'swe-bench-astropy-1'] as const; // Allow CI to select a specific task (or a subset) via env var const envTaskId = process.env['TB_TASK_ID']; const envTaskIds = process.env['TB_TASK_IDS']; // comma-separated list - let testTasks = baseTestTasks.map((t) => ({ ...t })); + let testTasks = [...baseTestTasks]; if (envTaskId || envTaskIds) { const selected = (envTaskIds || envTaskId || '') .split(',') .map((s) => s.trim()) .filter(Boolean); - const available = new Set(baseTestTasks.map((t) => t.id)); + const available = new Set(baseTestTasks.map((t) => t)); const unknown = selected.filter((s) => !available.has(s)); if (unknown.length > 0) { throw new Error( @@ -101,16 +102,14 @@ describe('terminal-bench integration', () => { ); } - testTasks = baseTestTasks - .filter((t) => selected.includes(t.id)) - .map((t) => ({ ...t })); + testTasks = baseTestTasks.filter((t) => selected.includes(t)); if (testTasks.length === 0) { throw new Error('No tasks selected via TB_TASK_ID/TB_TASK_IDS.'); } } - describe.each(testTasks)('Task: $id', ({ id: taskId, timeout }) => { + describe.each(testTasks)('Task: %s', (taskId) => { it(`should complete ${taskId} task with oracle agent`, async () => { rig.setup(`terminal-bench-oracle-${taskId}`); @@ -124,14 +123,56 @@ describe('terminal-bench integration', () => { ); } - // Run oracle agent on the task using ci-tasks dataset - const command = `tb run --agent oracle --dataset-path ${ciTasksPath} --task-id ${taskId} --output-path ${outputPath} --n-concurrent 1`; + // Run oracle agent on the task using ci-tasks dataset (non-blocking) + const args = [ + 'run', + '--agent', + 'oracle', + '--dataset-path', + ciTasksPath, + '--task-id', + taskId, + '--output-path', + outputPath, + '--n-concurrent', + '1', + ]; try { - const result = execSync(command, { - encoding: 'utf-8', - timeout: timeout - 60000, // Leave 1 minute buffer - env: { ...process.env }, + const result = await new Promise((resolve, reject) => { + let stdout = ''; + let stderr = ''; + + const child = spawn('tb', args, { env: { ...process.env } }); + + child.stdout?.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr?.on('data', (data) => { + stderr += data.toString(); + }); + + const to = setTimeout(() => { + child.kill(); + reject(new Error(`Process timeout for ${taskId}`)); + }, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer + + child.on('close', (code) => { + clearTimeout(to); + if (code !== 0) { + console.error(`oracle agent failed for ${taskId} with stderr:`, stderr); + reject(new Error(`Process exited with code ${code}: ${stderr}`)); + } else { + resolve(stdout); + } + }); + + child.on('error', (error) => { + clearTimeout(to); + console.error('Failed to start process:', error); + reject(error); + }); }); // Check if the run succeeded @@ -155,7 +196,7 @@ describe('terminal-bench integration', () => { console.error(`Oracle agent failed for ${taskId}:`, error); throw error; } - }, timeout); + }, DEFAULT_TIMEOUT_MS); it(`should complete ${taskId} task with qwen-code agent`, async () => { rig.setup(`terminal-bench-qwen-${taskId}`); @@ -231,7 +272,7 @@ describe('terminal-bench integration', () => { setTimeout(() => { child.kill(); reject(new Error(`Process timeout for ${taskId}`)); - }, timeout - 60000); // Leave 1 minute buffer + }, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer }).catch((error) => { // This is expected if API key is not configured correctly if (error instanceof Error && error.message?.includes('API')) { @@ -260,6 +301,6 @@ describe('terminal-bench integration', () => { expect(results).toHaveProperty('accuracy'); expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0 - }, timeout); + }, DEFAULT_TIMEOUT_MS); }); }); diff --git a/integration-tests/vitest.config.ts b/integration-tests/vitest.config.ts index e0c6b848..9b3709d3 100644 --- a/integration-tests/vitest.config.ts +++ b/integration-tests/vitest.config.ts @@ -6,9 +6,12 @@ import { defineConfig } from 'vitest/config'; +const timeoutMinutes = Number(process.env.TB_TIMEOUT_MINUTES || '5'); +const testTimeoutMs = timeoutMinutes * 60 * 1000; + export default defineConfig({ test: { - testTimeout: 300000, // 5 minutes + testTimeout: testTimeoutMs, globalSetup: './globalSetup.ts', reporters: ['default'], include: ['**/*.test.ts'],