This commit is contained in:
Yiheng Xu
2025-09-05 16:24:09 +08:00
parent c00287ac45
commit f64bf8dede
3 changed files with 175 additions and 153 deletions

View File

@@ -9,7 +9,7 @@ on:
type: 'string' type: 'string'
default: 'latest' default: 'latest'
release: release:
types: [published] types: ['published']
jobs: jobs:
terminal-bench: terminal-bench:

View File

@@ -4,7 +4,7 @@ services:
dockerfile: Dockerfile dockerfile: Dockerfile
image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME} image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME}
container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME} container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME}
command: [ "sh", "-c", "sleep infinity" ] command: ['sh', '-c', 'sleep infinity']
environment: environment:
- TEST_DIR=${T_BENCH_TEST_DIR} - TEST_DIR=${T_BENCH_TEST_DIR}
volumes: volumes:

View File

@@ -110,40 +110,155 @@ describe('terminal-bench integration', () => {
} }
describe.each(testTasks)('Task: %s', (taskId) => { describe.each(testTasks)('Task: %s', (taskId) => {
it(`should complete ${taskId} task with oracle agent`, async () => { it(
rig.setup(`terminal-bench-oracle-${taskId}`); `should complete ${taskId} task with oracle agent`,
async () => {
rig.setup(`terminal-bench-oracle-${taskId}`);
const outputPath = join(outputBase, `oracle-${taskId}`); const outputPath = join(outputBase, `oracle-${taskId}`);
// Check if ci-tasks exists // Check if ci-tasks exists
if (!existsSync(ciTasksPath)) { if (!existsSync(ciTasksPath)) {
console.error(`CI tasks directory does not exist: ${ciTasksPath}`); console.error(`CI tasks directory does not exist: ${ciTasksPath}`);
throw new Error( throw new Error(
'CI tasks not found. Please ensure ci-tasks directory is present.', 'CI tasks not found. Please ensure ci-tasks directory is present.',
); );
} }
// Run oracle agent on the task using ci-tasks dataset (non-blocking) // Run oracle agent on the task using ci-tasks dataset (non-blocking)
const args = [ const args = [
'run', 'run',
'--agent', '--agent',
'oracle', 'oracle',
'--dataset-path', '--dataset-path',
ciTasksPath, ciTasksPath,
'--task-id', '--task-id',
taskId, taskId,
'--output-path', '--output-path',
outputPath, outputPath,
'--n-concurrent', '--n-concurrent',
'1', '1',
]; ];
try { try {
const result = await new Promise<string>((resolve, reject) => {
let stdout = '';
let stderr = '';
const child = spawn('tb', args, { env: { ...process.env } });
child.stdout?.on('data', (data) => {
stdout += data.toString();
});
child.stderr?.on('data', (data) => {
stderr += data.toString();
});
const to = setTimeout(
() => {
child.kill();
reject(new Error(`Process timeout for ${taskId}`));
},
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
); // Leave 1 minute buffer
child.on('close', (code) => {
clearTimeout(to);
if (code !== 0) {
console.error(
`oracle agent failed for ${taskId} with stderr:`,
stderr,
);
reject(
new Error(`Process exited with code ${code}: ${stderr}`),
);
} else {
resolve(stdout);
}
});
child.on('error', (error) => {
clearTimeout(to);
console.error('Failed to start process:', error);
reject(error);
});
});
// Check if the run succeeded
expect(result).toContain('Results Summary');
// Check if results file was created
// Terminal-bench creates results in a timestamped subdirectory
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
.trim()
.split('\n');
const latestDir = dirs[dirs.length - 1]; // Get the latest run directory
const resultsFile = join(latestDir, 'results.json');
expect(existsSync(resultsFile)).toBe(true);
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
expect(results.accuracy).toBe(1.0); // Oracle should always succeed
expect(results.n_resolved).toBe(1);
expect(results.n_unresolved).toBe(0);
} catch (error) {
console.error(`Oracle agent failed for ${taskId}:`, error);
throw error;
}
},
DEFAULT_TIMEOUT_MS,
);
it(
`should complete ${taskId} task with qwen-code agent`,
async () => {
rig.setup(`terminal-bench-qwen-${taskId}`);
const outputPath = join(outputBase, `qwen-${taskId}`);
// Check if API key is available
const apiKey = process.env['OPENAI_API_KEY'];
if (!apiKey) {
throw new Error(
'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.',
);
}
// Run qwen-code agent using spawn to avoid blocking event loop
const args = [
'run',
'--agent-import-path',
'integration-tests.terminal-bench.qwen_code:QwenCodeAgent',
'--agent-kwarg',
`api_key=${apiKey}`,
'--agent-kwarg',
`version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`,
'--dataset-path',
ciTasksPath,
'--task-id',
taskId,
'--output-path',
outputPath,
'--n-concurrent',
'1',
];
const env = {
...process.env,
OPENAI_API_KEY: apiKey,
OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus',
OPENAI_BASE_URL:
process.env['OPENAI_BASE_URL'] ||
'https://dashscope.aliyuncs.com/compatible-mode/v1',
};
// Use spawn with promise to avoid blocking
const result = await new Promise<string>((resolve, reject) => { const result = await new Promise<string>((resolve, reject) => {
let stdout = ''; let stdout = '';
let stderr = ''; let stderr = '';
const child = spawn('tb', args, { env: { ...process.env } }); const child = spawn('tb', args, { env });
child.stdout?.on('data', (data) => { child.stdout?.on('data', (data) => {
stdout += data.toString(); stdout += data.toString();
@@ -153,15 +268,12 @@ describe('terminal-bench integration', () => {
stderr += data.toString(); stderr += data.toString();
}); });
const to = setTimeout(() => {
child.kill();
reject(new Error(`Process timeout for ${taskId}`));
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
child.on('close', (code) => { child.on('close', (code) => {
clearTimeout(to);
if (code !== 0) { if (code !== 0) {
console.error(`oracle agent failed for ${taskId} with stderr:`, stderr); console.error(
`qwen-code agent failed for ${taskId} with stderr:`,
stderr,
);
reject(new Error(`Process exited with code ${code}: ${stderr}`)); reject(new Error(`Process exited with code ${code}: ${stderr}`));
} else { } else {
resolve(stdout); resolve(stdout);
@@ -169,138 +281,48 @@ describe('terminal-bench integration', () => {
}); });
child.on('error', (error) => { child.on('error', (error) => {
clearTimeout(to);
console.error('Failed to start process:', error); console.error('Failed to start process:', error);
reject(error); reject(error);
}); });
// Set timeout based on task
setTimeout(
() => {
child.kill();
reject(new Error(`Process timeout for ${taskId}`));
},
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
); // Leave 1 minute buffer
}).catch((error) => {
// This is expected if API key is not configured correctly
if (error instanceof Error && error.message?.includes('API')) {
console.warn('API configuration issue - skipping test');
return '';
}
throw error;
}); });
// Check if the run succeeded if (!result) return; // Skip if API configuration issue
// Check if the run completed
expect(result).toContain('Results Summary'); expect(result).toContain('Results Summary');
// Check if results file was created // Check results file in timestamped subdirectory
// Terminal-bench creates results in a timestamped subdirectory
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' }) const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
.trim() .trim()
.split('\n'); .split('\n');
const latestDir = dirs[dirs.length - 1]; // Get the latest run directory const latestDir = dirs[dirs.length - 1];
const resultsFile = join(latestDir, 'results.json'); const resultsFile = join(latestDir, 'results.json');
expect(existsSync(resultsFile)).toBe(true); expect(existsSync(resultsFile)).toBe(true);
const results = JSON.parse(readFileSync(resultsFile, 'utf-8')); const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
expect(results.accuracy).toBe(1.0); // Oracle should always succeed // Check that the task actually completed successfully
expect(results.n_resolved).toBe(1); expect(results).toHaveProperty('accuracy');
expect(results.n_unresolved).toBe(0); expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
} catch (error) { expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
console.error(`Oracle agent failed for ${taskId}:`, error); },
throw error; DEFAULT_TIMEOUT_MS,
} );
}, DEFAULT_TIMEOUT_MS);
it(`should complete ${taskId} task with qwen-code agent`, async () => {
rig.setup(`terminal-bench-qwen-${taskId}`);
const outputPath = join(outputBase, `qwen-${taskId}`);
// Check if API key is available
const apiKey = process.env['OPENAI_API_KEY'];
if (!apiKey) {
throw new Error(
'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.',
);
}
// Run qwen-code agent using spawn to avoid blocking event loop
const args = [
'run',
'--agent-import-path',
'integration-tests.terminal-bench.qwen_code:QwenCodeAgent',
'--agent-kwarg',
`api_key=${apiKey}`,
'--agent-kwarg',
`version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`,
'--dataset-path',
ciTasksPath,
'--task-id',
taskId,
'--output-path',
outputPath,
'--n-concurrent',
'1',
];
const env = {
...process.env,
OPENAI_API_KEY: apiKey,
OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus',
OPENAI_BASE_URL:
process.env['OPENAI_BASE_URL'] ||
'https://dashscope.aliyuncs.com/compatible-mode/v1',
};
// Use spawn with promise to avoid blocking
const result = await new Promise<string>((resolve, reject) => {
let stdout = '';
let stderr = '';
const child = spawn('tb', args, { env });
child.stdout?.on('data', (data) => {
stdout += data.toString();
});
child.stderr?.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
if (code !== 0) {
console.error(`qwen-code agent failed for ${taskId} with stderr:`, stderr);
reject(new Error(`Process exited with code ${code}: ${stderr}`));
} else {
resolve(stdout);
}
});
child.on('error', (error) => {
console.error('Failed to start process:', error);
reject(error);
});
// Set timeout based on task
setTimeout(() => {
child.kill();
reject(new Error(`Process timeout for ${taskId}`));
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
}).catch((error) => {
// This is expected if API key is not configured correctly
if (error instanceof Error && error.message?.includes('API')) {
console.warn('API configuration issue - skipping test');
return '';
}
throw error;
});
if (!result) return; // Skip if API configuration issue
// Check if the run completed
expect(result).toContain('Results Summary');
// Check results file in timestamped subdirectory
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
.trim()
.split('\n');
const latestDir = dirs[dirs.length - 1];
const resultsFile = join(latestDir, 'results.json');
expect(existsSync(resultsFile)).toBe(true);
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
// Check that the task actually completed successfully
expect(results).toHaveProperty('accuracy');
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
}, DEFAULT_TIMEOUT_MS);
}); });
}); });