mirror of
https://github.com/QwenLM/qwen-code.git
synced 2025-12-23 02:07:52 +00:00
handle timeout
This commit is contained in:
19
.github/workflows/terminal-bench.yml
vendored
19
.github/workflows/terminal-bench.yml
vendored
@@ -1,16 +1,15 @@
|
|||||||
name: 'Terminal Bench Tests'
|
name: 'Terminal Bench Tests'
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- 'main'
|
|
||||||
- 'feat/**'
|
|
||||||
- 'feature/**'
|
|
||||||
pull_request:
|
|
||||||
branches:
|
|
||||||
- 'main'
|
|
||||||
merge_group:
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
version:
|
||||||
|
description: 'The version to test.'
|
||||||
|
required: true
|
||||||
|
type: 'string'
|
||||||
|
default: 'latest'
|
||||||
|
release:
|
||||||
|
types: [published]
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
terminal-bench:
|
terminal-bench:
|
||||||
@@ -64,6 +63,7 @@ jobs:
|
|||||||
VERBOSE: 'true'
|
VERBOSE: 'true'
|
||||||
KEEP_OUTPUT: 'true'
|
KEEP_OUTPUT: 'true'
|
||||||
TB_TASK_ID: '${{ matrix.task_id }}'
|
TB_TASK_ID: '${{ matrix.task_id }}'
|
||||||
|
TB_TIMEOUT_MINUTES: '30'
|
||||||
|
|
||||||
- name: 'Run Terminal Bench Qwen (task: ${{ matrix.task_id }})'
|
- name: 'Run Terminal Bench Qwen (task: ${{ matrix.task_id }})'
|
||||||
run: 'npm run test:terminal-bench:qwen'
|
run: 'npm run test:terminal-bench:qwen'
|
||||||
@@ -77,6 +77,7 @@ jobs:
|
|||||||
VERBOSE: 'true'
|
VERBOSE: 'true'
|
||||||
KEEP_OUTPUT: 'true'
|
KEEP_OUTPUT: 'true'
|
||||||
TB_TASK_ID: '${{ matrix.task_id }}'
|
TB_TASK_ID: '${{ matrix.task_id }}'
|
||||||
|
TB_TIMEOUT_MINUTES: '30'
|
||||||
|
|
||||||
- name: 'Upload test artifacts'
|
- name: 'Upload test artifacts'
|
||||||
if: 'always()'
|
if: 'always()'
|
||||||
|
|||||||
@@ -20,6 +20,12 @@ describe('terminal-bench integration', () => {
|
|||||||
// Use local ci-tasks directory for self-contained tests
|
// Use local ci-tasks directory for self-contained tests
|
||||||
const ciTasksPath = join(__dirname, 'ci-tasks');
|
const ciTasksPath = join(__dirname, 'ci-tasks');
|
||||||
|
|
||||||
|
// Single timeout source (minutes), defaults to workflow's 30 minutes
|
||||||
|
const DEFAULT_TIMEOUT_MINUTES = Number(
|
||||||
|
process.env['TB_TIMEOUT_MINUTES'] || '30',
|
||||||
|
);
|
||||||
|
const DEFAULT_TIMEOUT_MS = DEFAULT_TIMEOUT_MINUTES * 60 * 1000;
|
||||||
|
|
||||||
// Use the integration test directory set by globalSetup.ts if available,
|
// Use the integration test directory set by globalSetup.ts if available,
|
||||||
// otherwise create our own in .integration-tests
|
// otherwise create our own in .integration-tests
|
||||||
const integrationTestsDir = join(rootDir, '.integration-tests');
|
const integrationTestsDir = join(rootDir, '.integration-tests');
|
||||||
@@ -54,9 +60,7 @@ describe('terminal-bench integration', () => {
|
|||||||
console.log('Installing terminal-bench...');
|
console.log('Installing terminal-bench...');
|
||||||
// Use uv tool install for terminal-bench
|
// Use uv tool install for terminal-bench
|
||||||
try {
|
try {
|
||||||
execSync('uv tool install --python 3.12 terminal-bench', {
|
execSync('uv tool install --python 3.12 terminal-bench');
|
||||||
stdio: 'inherit',
|
|
||||||
});
|
|
||||||
// Add uv tools to PATH for this process
|
// Add uv tools to PATH for this process
|
||||||
process.env['PATH'] =
|
process.env['PATH'] =
|
||||||
`${process.env['HOME']}/.local/bin:${process.env['PATH']}`;
|
`${process.env['HOME']}/.local/bin:${process.env['PATH']}`;
|
||||||
@@ -77,23 +81,20 @@ describe('terminal-bench integration', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Test configuration for different tasks
|
// Test configuration for different tasks
|
||||||
const baseTestTasks = [
|
const baseTestTasks = ['hello-world', 'swe-bench-astropy-1'] as const;
|
||||||
{ id: 'hello-world', timeout: 300000 },
|
|
||||||
{ id: 'swe-bench-astropy-1', timeout: 600000 },
|
|
||||||
] as const;
|
|
||||||
|
|
||||||
// Allow CI to select a specific task (or a subset) via env var
|
// Allow CI to select a specific task (or a subset) via env var
|
||||||
const envTaskId = process.env['TB_TASK_ID'];
|
const envTaskId = process.env['TB_TASK_ID'];
|
||||||
const envTaskIds = process.env['TB_TASK_IDS']; // comma-separated list
|
const envTaskIds = process.env['TB_TASK_IDS']; // comma-separated list
|
||||||
|
|
||||||
let testTasks = baseTestTasks.map((t) => ({ ...t }));
|
let testTasks = [...baseTestTasks];
|
||||||
if (envTaskId || envTaskIds) {
|
if (envTaskId || envTaskIds) {
|
||||||
const selected = (envTaskIds || envTaskId || '')
|
const selected = (envTaskIds || envTaskId || '')
|
||||||
.split(',')
|
.split(',')
|
||||||
.map((s) => s.trim())
|
.map((s) => s.trim())
|
||||||
.filter(Boolean);
|
.filter(Boolean);
|
||||||
|
|
||||||
const available = new Set(baseTestTasks.map((t) => t.id));
|
const available = new Set(baseTestTasks.map((t) => t));
|
||||||
const unknown = selected.filter((s) => !available.has(s));
|
const unknown = selected.filter((s) => !available.has(s));
|
||||||
if (unknown.length > 0) {
|
if (unknown.length > 0) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
@@ -101,16 +102,14 @@ describe('terminal-bench integration', () => {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
testTasks = baseTestTasks
|
testTasks = baseTestTasks.filter((t) => selected.includes(t));
|
||||||
.filter((t) => selected.includes(t.id))
|
|
||||||
.map((t) => ({ ...t }));
|
|
||||||
|
|
||||||
if (testTasks.length === 0) {
|
if (testTasks.length === 0) {
|
||||||
throw new Error('No tasks selected via TB_TASK_ID/TB_TASK_IDS.');
|
throw new Error('No tasks selected via TB_TASK_ID/TB_TASK_IDS.');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
describe.each(testTasks)('Task: $id', ({ id: taskId, timeout }) => {
|
describe.each(testTasks)('Task: %s', (taskId) => {
|
||||||
it(`should complete ${taskId} task with oracle agent`, async () => {
|
it(`should complete ${taskId} task with oracle agent`, async () => {
|
||||||
rig.setup(`terminal-bench-oracle-${taskId}`);
|
rig.setup(`terminal-bench-oracle-${taskId}`);
|
||||||
|
|
||||||
@@ -124,14 +123,56 @@ describe('terminal-bench integration', () => {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run oracle agent on the task using ci-tasks dataset
|
// Run oracle agent on the task using ci-tasks dataset (non-blocking)
|
||||||
const command = `tb run --agent oracle --dataset-path ${ciTasksPath} --task-id ${taskId} --output-path ${outputPath} --n-concurrent 1`;
|
const args = [
|
||||||
|
'run',
|
||||||
|
'--agent',
|
||||||
|
'oracle',
|
||||||
|
'--dataset-path',
|
||||||
|
ciTasksPath,
|
||||||
|
'--task-id',
|
||||||
|
taskId,
|
||||||
|
'--output-path',
|
||||||
|
outputPath,
|
||||||
|
'--n-concurrent',
|
||||||
|
'1',
|
||||||
|
];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = execSync(command, {
|
const result = await new Promise<string>((resolve, reject) => {
|
||||||
encoding: 'utf-8',
|
let stdout = '';
|
||||||
timeout: timeout - 60000, // Leave 1 minute buffer
|
let stderr = '';
|
||||||
env: { ...process.env },
|
|
||||||
|
const child = spawn('tb', args, { env: { ...process.env } });
|
||||||
|
|
||||||
|
child.stdout?.on('data', (data) => {
|
||||||
|
stdout += data.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
child.stderr?.on('data', (data) => {
|
||||||
|
stderr += data.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
const to = setTimeout(() => {
|
||||||
|
child.kill();
|
||||||
|
reject(new Error(`Process timeout for ${taskId}`));
|
||||||
|
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
|
||||||
|
|
||||||
|
child.on('close', (code) => {
|
||||||
|
clearTimeout(to);
|
||||||
|
if (code !== 0) {
|
||||||
|
console.error(`oracle agent failed for ${taskId} with stderr:`, stderr);
|
||||||
|
reject(new Error(`Process exited with code ${code}: ${stderr}`));
|
||||||
|
} else {
|
||||||
|
resolve(stdout);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('error', (error) => {
|
||||||
|
clearTimeout(to);
|
||||||
|
console.error('Failed to start process:', error);
|
||||||
|
reject(error);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// Check if the run succeeded
|
// Check if the run succeeded
|
||||||
@@ -155,7 +196,7 @@ describe('terminal-bench integration', () => {
|
|||||||
console.error(`Oracle agent failed for ${taskId}:`, error);
|
console.error(`Oracle agent failed for ${taskId}:`, error);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}, timeout);
|
}, DEFAULT_TIMEOUT_MS);
|
||||||
|
|
||||||
it(`should complete ${taskId} task with qwen-code agent`, async () => {
|
it(`should complete ${taskId} task with qwen-code agent`, async () => {
|
||||||
rig.setup(`terminal-bench-qwen-${taskId}`);
|
rig.setup(`terminal-bench-qwen-${taskId}`);
|
||||||
@@ -231,7 +272,7 @@ describe('terminal-bench integration', () => {
|
|||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
child.kill();
|
child.kill();
|
||||||
reject(new Error(`Process timeout for ${taskId}`));
|
reject(new Error(`Process timeout for ${taskId}`));
|
||||||
}, timeout - 60000); // Leave 1 minute buffer
|
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
|
||||||
}).catch((error) => {
|
}).catch((error) => {
|
||||||
// This is expected if API key is not configured correctly
|
// This is expected if API key is not configured correctly
|
||||||
if (error instanceof Error && error.message?.includes('API')) {
|
if (error instanceof Error && error.message?.includes('API')) {
|
||||||
@@ -260,6 +301,6 @@ describe('terminal-bench integration', () => {
|
|||||||
expect(results).toHaveProperty('accuracy');
|
expect(results).toHaveProperty('accuracy');
|
||||||
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
|
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
|
||||||
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
|
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
|
||||||
}, timeout);
|
}, DEFAULT_TIMEOUT_MS);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -6,9 +6,12 @@
|
|||||||
|
|
||||||
import { defineConfig } from 'vitest/config';
|
import { defineConfig } from 'vitest/config';
|
||||||
|
|
||||||
|
const timeoutMinutes = Number(process.env.TB_TIMEOUT_MINUTES || '5');
|
||||||
|
const testTimeoutMs = timeoutMinutes * 60 * 1000;
|
||||||
|
|
||||||
export default defineConfig({
|
export default defineConfig({
|
||||||
test: {
|
test: {
|
||||||
testTimeout: 300000, // 5 minutes
|
testTimeout: testTimeoutMs,
|
||||||
globalSetup: './globalSetup.ts',
|
globalSetup: './globalSetup.ts',
|
||||||
reporters: ['default'],
|
reporters: ['default'],
|
||||||
include: ['**/*.test.ts'],
|
include: ['**/*.test.ts'],
|
||||||
|
|||||||
Reference in New Issue
Block a user