This commit is contained in:
Yiheng Xu
2025-09-05 16:24:09 +08:00
parent c00287ac45
commit f64bf8dede
3 changed files with 175 additions and 153 deletions

View File

@@ -9,7 +9,7 @@ on:
type: 'string' type: 'string'
default: 'latest' default: 'latest'
release: release:
types: [published] types: ['published']
jobs: jobs:
terminal-bench: terminal-bench:

View File

@@ -4,7 +4,7 @@ services:
dockerfile: Dockerfile dockerfile: Dockerfile
image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME} image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME}
container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME} container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME}
command: [ "sh", "-c", "sleep infinity" ] command: ['sh', '-c', 'sleep infinity']
environment: environment:
- TEST_DIR=${T_BENCH_TEST_DIR} - TEST_DIR=${T_BENCH_TEST_DIR}
volumes: volumes:

View File

@@ -110,7 +110,9 @@ describe('terminal-bench integration', () => {
} }
describe.each(testTasks)('Task: %s', (taskId) => { describe.each(testTasks)('Task: %s', (taskId) => {
it(`should complete ${taskId} task with oracle agent`, async () => { it(
`should complete ${taskId} task with oracle agent`,
async () => {
rig.setup(`terminal-bench-oracle-${taskId}`); rig.setup(`terminal-bench-oracle-${taskId}`);
const outputPath = join(outputBase, `oracle-${taskId}`); const outputPath = join(outputBase, `oracle-${taskId}`);
@@ -153,16 +155,24 @@ describe('terminal-bench integration', () => {
stderr += data.toString(); stderr += data.toString();
}); });
const to = setTimeout(() => { const to = setTimeout(
() => {
child.kill(); child.kill();
reject(new Error(`Process timeout for ${taskId}`)); reject(new Error(`Process timeout for ${taskId}`));
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer },
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
); // Leave 1 minute buffer
child.on('close', (code) => { child.on('close', (code) => {
clearTimeout(to); clearTimeout(to);
if (code !== 0) { if (code !== 0) {
console.error(`oracle agent failed for ${taskId} with stderr:`, stderr); console.error(
reject(new Error(`Process exited with code ${code}: ${stderr}`)); `oracle agent failed for ${taskId} with stderr:`,
stderr,
);
reject(
new Error(`Process exited with code ${code}: ${stderr}`),
);
} else { } else {
resolve(stdout); resolve(stdout);
} }
@@ -196,9 +206,13 @@ describe('terminal-bench integration', () => {
console.error(`Oracle agent failed for ${taskId}:`, error); console.error(`Oracle agent failed for ${taskId}:`, error);
throw error; throw error;
} }
}, DEFAULT_TIMEOUT_MS); },
DEFAULT_TIMEOUT_MS,
);
it(`should complete ${taskId} task with qwen-code agent`, async () => { it(
`should complete ${taskId} task with qwen-code agent`,
async () => {
rig.setup(`terminal-bench-qwen-${taskId}`); rig.setup(`terminal-bench-qwen-${taskId}`);
const outputPath = join(outputBase, `qwen-${taskId}`); const outputPath = join(outputBase, `qwen-${taskId}`);
@@ -256,7 +270,10 @@ describe('terminal-bench integration', () => {
child.on('close', (code) => { child.on('close', (code) => {
if (code !== 0) { if (code !== 0) {
console.error(`qwen-code agent failed for ${taskId} with stderr:`, stderr); console.error(
`qwen-code agent failed for ${taskId} with stderr:`,
stderr,
);
reject(new Error(`Process exited with code ${code}: ${stderr}`)); reject(new Error(`Process exited with code ${code}: ${stderr}`));
} else { } else {
resolve(stdout); resolve(stdout);
@@ -269,10 +286,13 @@ describe('terminal-bench integration', () => {
}); });
// Set timeout based on task // Set timeout based on task
setTimeout(() => { setTimeout(
() => {
child.kill(); child.kill();
reject(new Error(`Process timeout for ${taskId}`)); reject(new Error(`Process timeout for ${taskId}`));
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer },
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
); // Leave 1 minute buffer
}).catch((error) => { }).catch((error) => {
// This is expected if API key is not configured correctly // This is expected if API key is not configured correctly
if (error instanceof Error && error.message?.includes('API')) { if (error instanceof Error && error.message?.includes('API')) {
@@ -301,6 +321,8 @@ describe('terminal-bench integration', () => {
expect(results).toHaveProperty('accuracy'); expect(results).toHaveProperty('accuracy');
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0 expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
}, DEFAULT_TIMEOUT_MS); },
DEFAULT_TIMEOUT_MS,
);
}); });
}); });