handle timeout

This commit is contained in:
Yiheng Xu
2025-09-05 16:17:46 +08:00
parent 6b394bd61d
commit c00287ac45
3 changed files with 77 additions and 32 deletions

View File

@@ -1,16 +1,15 @@
name: 'Terminal Bench Tests' name: 'Terminal Bench Tests'
on: on:
push:
branches:
- 'main'
- 'feat/**'
- 'feature/**'
pull_request:
branches:
- 'main'
merge_group:
workflow_dispatch: workflow_dispatch:
inputs:
version:
description: 'The version to test.'
required: true
type: 'string'
default: 'latest'
release:
types: [published]
jobs: jobs:
terminal-bench: terminal-bench:
@@ -64,6 +63,7 @@ jobs:
VERBOSE: 'true' VERBOSE: 'true'
KEEP_OUTPUT: 'true' KEEP_OUTPUT: 'true'
TB_TASK_ID: '${{ matrix.task_id }}' TB_TASK_ID: '${{ matrix.task_id }}'
TB_TIMEOUT_MINUTES: '30'
- name: 'Run Terminal Bench Qwen (task: ${{ matrix.task_id }})' - name: 'Run Terminal Bench Qwen (task: ${{ matrix.task_id }})'
run: 'npm run test:terminal-bench:qwen' run: 'npm run test:terminal-bench:qwen'
@@ -77,6 +77,7 @@ jobs:
VERBOSE: 'true' VERBOSE: 'true'
KEEP_OUTPUT: 'true' KEEP_OUTPUT: 'true'
TB_TASK_ID: '${{ matrix.task_id }}' TB_TASK_ID: '${{ matrix.task_id }}'
TB_TIMEOUT_MINUTES: '30'
- name: 'Upload test artifacts' - name: 'Upload test artifacts'
if: 'always()' if: 'always()'

View File

@@ -20,6 +20,12 @@ describe('terminal-bench integration', () => {
// Use local ci-tasks directory for self-contained tests // Use local ci-tasks directory for self-contained tests
const ciTasksPath = join(__dirname, 'ci-tasks'); const ciTasksPath = join(__dirname, 'ci-tasks');
// Single timeout source (minutes), defaults to workflow's 30 minutes
const DEFAULT_TIMEOUT_MINUTES = Number(
process.env['TB_TIMEOUT_MINUTES'] || '30',
);
const DEFAULT_TIMEOUT_MS = DEFAULT_TIMEOUT_MINUTES * 60 * 1000;
// Use the integration test directory set by globalSetup.ts if available, // Use the integration test directory set by globalSetup.ts if available,
// otherwise create our own in .integration-tests // otherwise create our own in .integration-tests
const integrationTestsDir = join(rootDir, '.integration-tests'); const integrationTestsDir = join(rootDir, '.integration-tests');
@@ -54,9 +60,7 @@ describe('terminal-bench integration', () => {
console.log('Installing terminal-bench...'); console.log('Installing terminal-bench...');
// Use uv tool install for terminal-bench // Use uv tool install for terminal-bench
try { try {
execSync('uv tool install --python 3.12 terminal-bench', { execSync('uv tool install --python 3.12 terminal-bench');
stdio: 'inherit',
});
// Add uv tools to PATH for this process // Add uv tools to PATH for this process
process.env['PATH'] = process.env['PATH'] =
`${process.env['HOME']}/.local/bin:${process.env['PATH']}`; `${process.env['HOME']}/.local/bin:${process.env['PATH']}`;
@@ -77,23 +81,20 @@ describe('terminal-bench integration', () => {
}); });
// Test configuration for different tasks // Test configuration for different tasks
const baseTestTasks = [ const baseTestTasks = ['hello-world', 'swe-bench-astropy-1'] as const;
{ id: 'hello-world', timeout: 300000 },
{ id: 'swe-bench-astropy-1', timeout: 600000 },
] as const;
// Allow CI to select a specific task (or a subset) via env var // Allow CI to select a specific task (or a subset) via env var
const envTaskId = process.env['TB_TASK_ID']; const envTaskId = process.env['TB_TASK_ID'];
const envTaskIds = process.env['TB_TASK_IDS']; // comma-separated list const envTaskIds = process.env['TB_TASK_IDS']; // comma-separated list
let testTasks = baseTestTasks.map((t) => ({ ...t })); let testTasks = [...baseTestTasks];
if (envTaskId || envTaskIds) { if (envTaskId || envTaskIds) {
const selected = (envTaskIds || envTaskId || '') const selected = (envTaskIds || envTaskId || '')
.split(',') .split(',')
.map((s) => s.trim()) .map((s) => s.trim())
.filter(Boolean); .filter(Boolean);
const available = new Set(baseTestTasks.map((t) => t.id)); const available = new Set(baseTestTasks.map((t) => t));
const unknown = selected.filter((s) => !available.has(s)); const unknown = selected.filter((s) => !available.has(s));
if (unknown.length > 0) { if (unknown.length > 0) {
throw new Error( throw new Error(
@@ -101,16 +102,14 @@ describe('terminal-bench integration', () => {
); );
} }
testTasks = baseTestTasks testTasks = baseTestTasks.filter((t) => selected.includes(t));
.filter((t) => selected.includes(t.id))
.map((t) => ({ ...t }));
if (testTasks.length === 0) { if (testTasks.length === 0) {
throw new Error('No tasks selected via TB_TASK_ID/TB_TASK_IDS.'); throw new Error('No tasks selected via TB_TASK_ID/TB_TASK_IDS.');
} }
} }
describe.each(testTasks)('Task: $id', ({ id: taskId, timeout }) => { describe.each(testTasks)('Task: %s', (taskId) => {
it(`should complete ${taskId} task with oracle agent`, async () => { it(`should complete ${taskId} task with oracle agent`, async () => {
rig.setup(`terminal-bench-oracle-${taskId}`); rig.setup(`terminal-bench-oracle-${taskId}`);
@@ -124,14 +123,56 @@ describe('terminal-bench integration', () => {
); );
} }
// Run oracle agent on the task using ci-tasks dataset // Run oracle agent on the task using ci-tasks dataset (non-blocking)
const command = `tb run --agent oracle --dataset-path ${ciTasksPath} --task-id ${taskId} --output-path ${outputPath} --n-concurrent 1`; const args = [
'run',
'--agent',
'oracle',
'--dataset-path',
ciTasksPath,
'--task-id',
taskId,
'--output-path',
outputPath,
'--n-concurrent',
'1',
];
try { try {
const result = execSync(command, { const result = await new Promise<string>((resolve, reject) => {
encoding: 'utf-8', let stdout = '';
timeout: timeout - 60000, // Leave 1 minute buffer let stderr = '';
env: { ...process.env },
const child = spawn('tb', args, { env: { ...process.env } });
child.stdout?.on('data', (data) => {
stdout += data.toString();
});
child.stderr?.on('data', (data) => {
stderr += data.toString();
});
const to = setTimeout(() => {
child.kill();
reject(new Error(`Process timeout for ${taskId}`));
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
child.on('close', (code) => {
clearTimeout(to);
if (code !== 0) {
console.error(`oracle agent failed for ${taskId} with stderr:`, stderr);
reject(new Error(`Process exited with code ${code}: ${stderr}`));
} else {
resolve(stdout);
}
});
child.on('error', (error) => {
clearTimeout(to);
console.error('Failed to start process:', error);
reject(error);
});
}); });
// Check if the run succeeded // Check if the run succeeded
@@ -155,7 +196,7 @@ describe('terminal-bench integration', () => {
console.error(`Oracle agent failed for ${taskId}:`, error); console.error(`Oracle agent failed for ${taskId}:`, error);
throw error; throw error;
} }
}, timeout); }, DEFAULT_TIMEOUT_MS);
it(`should complete ${taskId} task with qwen-code agent`, async () => { it(`should complete ${taskId} task with qwen-code agent`, async () => {
rig.setup(`terminal-bench-qwen-${taskId}`); rig.setup(`terminal-bench-qwen-${taskId}`);
@@ -231,7 +272,7 @@ describe('terminal-bench integration', () => {
setTimeout(() => { setTimeout(() => {
child.kill(); child.kill();
reject(new Error(`Process timeout for ${taskId}`)); reject(new Error(`Process timeout for ${taskId}`));
}, timeout - 60000); // Leave 1 minute buffer }, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
}).catch((error) => { }).catch((error) => {
// This is expected if API key is not configured correctly // This is expected if API key is not configured correctly
if (error instanceof Error && error.message?.includes('API')) { if (error instanceof Error && error.message?.includes('API')) {
@@ -260,6 +301,6 @@ describe('terminal-bench integration', () => {
expect(results).toHaveProperty('accuracy'); expect(results).toHaveProperty('accuracy');
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0 expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
}, timeout); }, DEFAULT_TIMEOUT_MS);
}); });
}); });

View File

@@ -6,9 +6,12 @@
import { defineConfig } from 'vitest/config'; import { defineConfig } from 'vitest/config';
const timeoutMinutes = Number(process.env.TB_TIMEOUT_MINUTES || '5');
const testTimeoutMs = timeoutMinutes * 60 * 1000;
export default defineConfig({ export default defineConfig({
test: { test: {
testTimeout: 300000, // 5 minutes testTimeout: testTimeoutMs,
globalSetup: './globalSetup.ts', globalSetup: './globalSetup.ts',
reporters: ['default'], reporters: ['default'],
include: ['**/*.test.ts'], include: ['**/*.test.ts'],