mirror of
https://github.com/QwenLM/qwen-code.git
synced 2025-12-22 17:57:46 +00:00
fix lint
This commit is contained in:
2
.github/workflows/terminal-bench.yml
vendored
2
.github/workflows/terminal-bench.yml
vendored
@@ -9,7 +9,7 @@ on:
|
|||||||
type: 'string'
|
type: 'string'
|
||||||
default: 'latest'
|
default: 'latest'
|
||||||
release:
|
release:
|
||||||
types: [published]
|
types: ['published']
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
terminal-bench:
|
terminal-bench:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ services:
|
|||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME}
|
image: ${T_BENCH_TASK_DOCKER_CLIENT_IMAGE_NAME}
|
||||||
container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME}
|
container_name: ${T_BENCH_TASK_DOCKER_CLIENT_CONTAINER_NAME}
|
||||||
command: [ "sh", "-c", "sleep infinity" ]
|
command: ['sh', '-c', 'sleep infinity']
|
||||||
environment:
|
environment:
|
||||||
- TEST_DIR=${T_BENCH_TEST_DIR}
|
- TEST_DIR=${T_BENCH_TEST_DIR}
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
@@ -110,40 +110,155 @@ describe('terminal-bench integration', () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
describe.each(testTasks)('Task: %s', (taskId) => {
|
describe.each(testTasks)('Task: %s', (taskId) => {
|
||||||
it(`should complete ${taskId} task with oracle agent`, async () => {
|
it(
|
||||||
rig.setup(`terminal-bench-oracle-${taskId}`);
|
`should complete ${taskId} task with oracle agent`,
|
||||||
|
async () => {
|
||||||
|
rig.setup(`terminal-bench-oracle-${taskId}`);
|
||||||
|
|
||||||
const outputPath = join(outputBase, `oracle-${taskId}`);
|
const outputPath = join(outputBase, `oracle-${taskId}`);
|
||||||
|
|
||||||
// Check if ci-tasks exists
|
// Check if ci-tasks exists
|
||||||
if (!existsSync(ciTasksPath)) {
|
if (!existsSync(ciTasksPath)) {
|
||||||
console.error(`CI tasks directory does not exist: ${ciTasksPath}`);
|
console.error(`CI tasks directory does not exist: ${ciTasksPath}`);
|
||||||
throw new Error(
|
throw new Error(
|
||||||
'CI tasks not found. Please ensure ci-tasks directory is present.',
|
'CI tasks not found. Please ensure ci-tasks directory is present.',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run oracle agent on the task using ci-tasks dataset (non-blocking)
|
// Run oracle agent on the task using ci-tasks dataset (non-blocking)
|
||||||
const args = [
|
const args = [
|
||||||
'run',
|
'run',
|
||||||
'--agent',
|
'--agent',
|
||||||
'oracle',
|
'oracle',
|
||||||
'--dataset-path',
|
'--dataset-path',
|
||||||
ciTasksPath,
|
ciTasksPath,
|
||||||
'--task-id',
|
'--task-id',
|
||||||
taskId,
|
taskId,
|
||||||
'--output-path',
|
'--output-path',
|
||||||
outputPath,
|
outputPath,
|
||||||
'--n-concurrent',
|
'--n-concurrent',
|
||||||
'1',
|
'1',
|
||||||
];
|
];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const result = await new Promise<string>((resolve, reject) => {
|
||||||
|
let stdout = '';
|
||||||
|
let stderr = '';
|
||||||
|
|
||||||
|
const child = spawn('tb', args, { env: { ...process.env } });
|
||||||
|
|
||||||
|
child.stdout?.on('data', (data) => {
|
||||||
|
stdout += data.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
child.stderr?.on('data', (data) => {
|
||||||
|
stderr += data.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
const to = setTimeout(
|
||||||
|
() => {
|
||||||
|
child.kill();
|
||||||
|
reject(new Error(`Process timeout for ${taskId}`));
|
||||||
|
},
|
||||||
|
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
|
||||||
|
); // Leave 1 minute buffer
|
||||||
|
|
||||||
|
child.on('close', (code) => {
|
||||||
|
clearTimeout(to);
|
||||||
|
if (code !== 0) {
|
||||||
|
console.error(
|
||||||
|
`oracle agent failed for ${taskId} with stderr:`,
|
||||||
|
stderr,
|
||||||
|
);
|
||||||
|
reject(
|
||||||
|
new Error(`Process exited with code ${code}: ${stderr}`),
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
resolve(stdout);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('error', (error) => {
|
||||||
|
clearTimeout(to);
|
||||||
|
console.error('Failed to start process:', error);
|
||||||
|
reject(error);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Check if the run succeeded
|
||||||
|
expect(result).toContain('Results Summary');
|
||||||
|
|
||||||
|
// Check if results file was created
|
||||||
|
// Terminal-bench creates results in a timestamped subdirectory
|
||||||
|
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
|
||||||
|
.trim()
|
||||||
|
.split('\n');
|
||||||
|
const latestDir = dirs[dirs.length - 1]; // Get the latest run directory
|
||||||
|
const resultsFile = join(latestDir, 'results.json');
|
||||||
|
|
||||||
|
expect(existsSync(resultsFile)).toBe(true);
|
||||||
|
|
||||||
|
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
|
||||||
|
expect(results.accuracy).toBe(1.0); // Oracle should always succeed
|
||||||
|
expect(results.n_resolved).toBe(1);
|
||||||
|
expect(results.n_unresolved).toBe(0);
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Oracle agent failed for ${taskId}:`, error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
DEFAULT_TIMEOUT_MS,
|
||||||
|
);
|
||||||
|
|
||||||
|
it(
|
||||||
|
`should complete ${taskId} task with qwen-code agent`,
|
||||||
|
async () => {
|
||||||
|
rig.setup(`terminal-bench-qwen-${taskId}`);
|
||||||
|
|
||||||
|
const outputPath = join(outputBase, `qwen-${taskId}`);
|
||||||
|
|
||||||
|
// Check if API key is available
|
||||||
|
const apiKey = process.env['OPENAI_API_KEY'];
|
||||||
|
if (!apiKey) {
|
||||||
|
throw new Error(
|
||||||
|
'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run qwen-code agent using spawn to avoid blocking event loop
|
||||||
|
const args = [
|
||||||
|
'run',
|
||||||
|
'--agent-import-path',
|
||||||
|
'integration-tests.terminal-bench.qwen_code:QwenCodeAgent',
|
||||||
|
'--agent-kwarg',
|
||||||
|
`api_key=${apiKey}`,
|
||||||
|
'--agent-kwarg',
|
||||||
|
`version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`,
|
||||||
|
'--dataset-path',
|
||||||
|
ciTasksPath,
|
||||||
|
'--task-id',
|
||||||
|
taskId,
|
||||||
|
'--output-path',
|
||||||
|
outputPath,
|
||||||
|
'--n-concurrent',
|
||||||
|
'1',
|
||||||
|
];
|
||||||
|
|
||||||
|
const env = {
|
||||||
|
...process.env,
|
||||||
|
OPENAI_API_KEY: apiKey,
|
||||||
|
OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus',
|
||||||
|
OPENAI_BASE_URL:
|
||||||
|
process.env['OPENAI_BASE_URL'] ||
|
||||||
|
'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Use spawn with promise to avoid blocking
|
||||||
const result = await new Promise<string>((resolve, reject) => {
|
const result = await new Promise<string>((resolve, reject) => {
|
||||||
let stdout = '';
|
let stdout = '';
|
||||||
let stderr = '';
|
let stderr = '';
|
||||||
|
|
||||||
const child = spawn('tb', args, { env: { ...process.env } });
|
const child = spawn('tb', args, { env });
|
||||||
|
|
||||||
child.stdout?.on('data', (data) => {
|
child.stdout?.on('data', (data) => {
|
||||||
stdout += data.toString();
|
stdout += data.toString();
|
||||||
@@ -153,15 +268,12 @@ describe('terminal-bench integration', () => {
|
|||||||
stderr += data.toString();
|
stderr += data.toString();
|
||||||
});
|
});
|
||||||
|
|
||||||
const to = setTimeout(() => {
|
|
||||||
child.kill();
|
|
||||||
reject(new Error(`Process timeout for ${taskId}`));
|
|
||||||
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
|
|
||||||
|
|
||||||
child.on('close', (code) => {
|
child.on('close', (code) => {
|
||||||
clearTimeout(to);
|
|
||||||
if (code !== 0) {
|
if (code !== 0) {
|
||||||
console.error(`oracle agent failed for ${taskId} with stderr:`, stderr);
|
console.error(
|
||||||
|
`qwen-code agent failed for ${taskId} with stderr:`,
|
||||||
|
stderr,
|
||||||
|
);
|
||||||
reject(new Error(`Process exited with code ${code}: ${stderr}`));
|
reject(new Error(`Process exited with code ${code}: ${stderr}`));
|
||||||
} else {
|
} else {
|
||||||
resolve(stdout);
|
resolve(stdout);
|
||||||
@@ -169,138 +281,48 @@ describe('terminal-bench integration', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
child.on('error', (error) => {
|
child.on('error', (error) => {
|
||||||
clearTimeout(to);
|
|
||||||
console.error('Failed to start process:', error);
|
console.error('Failed to start process:', error);
|
||||||
reject(error);
|
reject(error);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Set timeout based on task
|
||||||
|
setTimeout(
|
||||||
|
() => {
|
||||||
|
child.kill();
|
||||||
|
reject(new Error(`Process timeout for ${taskId}`));
|
||||||
|
},
|
||||||
|
Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000),
|
||||||
|
); // Leave 1 minute buffer
|
||||||
|
}).catch((error) => {
|
||||||
|
// This is expected if API key is not configured correctly
|
||||||
|
if (error instanceof Error && error.message?.includes('API')) {
|
||||||
|
console.warn('API configuration issue - skipping test');
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
});
|
});
|
||||||
|
|
||||||
// Check if the run succeeded
|
if (!result) return; // Skip if API configuration issue
|
||||||
|
|
||||||
|
// Check if the run completed
|
||||||
expect(result).toContain('Results Summary');
|
expect(result).toContain('Results Summary');
|
||||||
|
|
||||||
// Check if results file was created
|
// Check results file in timestamped subdirectory
|
||||||
// Terminal-bench creates results in a timestamped subdirectory
|
|
||||||
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
|
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
|
||||||
.trim()
|
.trim()
|
||||||
.split('\n');
|
.split('\n');
|
||||||
const latestDir = dirs[dirs.length - 1]; // Get the latest run directory
|
const latestDir = dirs[dirs.length - 1];
|
||||||
const resultsFile = join(latestDir, 'results.json');
|
const resultsFile = join(latestDir, 'results.json');
|
||||||
|
|
||||||
expect(existsSync(resultsFile)).toBe(true);
|
expect(existsSync(resultsFile)).toBe(true);
|
||||||
|
|
||||||
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
|
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
|
||||||
expect(results.accuracy).toBe(1.0); // Oracle should always succeed
|
// Check that the task actually completed successfully
|
||||||
expect(results.n_resolved).toBe(1);
|
expect(results).toHaveProperty('accuracy');
|
||||||
expect(results.n_unresolved).toBe(0);
|
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
|
||||||
} catch (error) {
|
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
|
||||||
console.error(`Oracle agent failed for ${taskId}:`, error);
|
},
|
||||||
throw error;
|
DEFAULT_TIMEOUT_MS,
|
||||||
}
|
);
|
||||||
}, DEFAULT_TIMEOUT_MS);
|
|
||||||
|
|
||||||
it(`should complete ${taskId} task with qwen-code agent`, async () => {
|
|
||||||
rig.setup(`terminal-bench-qwen-${taskId}`);
|
|
||||||
|
|
||||||
const outputPath = join(outputBase, `qwen-${taskId}`);
|
|
||||||
|
|
||||||
// Check if API key is available
|
|
||||||
const apiKey = process.env['OPENAI_API_KEY'];
|
|
||||||
if (!apiKey) {
|
|
||||||
throw new Error(
|
|
||||||
'OPENAI_API_KEY environment variable is not set. This test requires an API key to run the qwen-code agent.',
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run qwen-code agent using spawn to avoid blocking event loop
|
|
||||||
const args = [
|
|
||||||
'run',
|
|
||||||
'--agent-import-path',
|
|
||||||
'integration-tests.terminal-bench.qwen_code:QwenCodeAgent',
|
|
||||||
'--agent-kwarg',
|
|
||||||
`api_key=${apiKey}`,
|
|
||||||
'--agent-kwarg',
|
|
||||||
`version=${process.env['QWEN_CODE_VERSION'] || 'latest'}`,
|
|
||||||
'--dataset-path',
|
|
||||||
ciTasksPath,
|
|
||||||
'--task-id',
|
|
||||||
taskId,
|
|
||||||
'--output-path',
|
|
||||||
outputPath,
|
|
||||||
'--n-concurrent',
|
|
||||||
'1',
|
|
||||||
];
|
|
||||||
|
|
||||||
const env = {
|
|
||||||
...process.env,
|
|
||||||
OPENAI_API_KEY: apiKey,
|
|
||||||
OPENAI_MODEL: process.env['OPENAI_MODEL'] || 'qwen3-coder-plus',
|
|
||||||
OPENAI_BASE_URL:
|
|
||||||
process.env['OPENAI_BASE_URL'] ||
|
|
||||||
'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
||||||
};
|
|
||||||
|
|
||||||
// Use spawn with promise to avoid blocking
|
|
||||||
const result = await new Promise<string>((resolve, reject) => {
|
|
||||||
let stdout = '';
|
|
||||||
let stderr = '';
|
|
||||||
|
|
||||||
const child = spawn('tb', args, { env });
|
|
||||||
|
|
||||||
child.stdout?.on('data', (data) => {
|
|
||||||
stdout += data.toString();
|
|
||||||
});
|
|
||||||
|
|
||||||
child.stderr?.on('data', (data) => {
|
|
||||||
stderr += data.toString();
|
|
||||||
});
|
|
||||||
|
|
||||||
child.on('close', (code) => {
|
|
||||||
if (code !== 0) {
|
|
||||||
console.error(`qwen-code agent failed for ${taskId} with stderr:`, stderr);
|
|
||||||
reject(new Error(`Process exited with code ${code}: ${stderr}`));
|
|
||||||
} else {
|
|
||||||
resolve(stdout);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
child.on('error', (error) => {
|
|
||||||
console.error('Failed to start process:', error);
|
|
||||||
reject(error);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Set timeout based on task
|
|
||||||
setTimeout(() => {
|
|
||||||
child.kill();
|
|
||||||
reject(new Error(`Process timeout for ${taskId}`));
|
|
||||||
}, Math.max(60_000, DEFAULT_TIMEOUT_MS - 60_000)); // Leave 1 minute buffer
|
|
||||||
}).catch((error) => {
|
|
||||||
// This is expected if API key is not configured correctly
|
|
||||||
if (error instanceof Error && error.message?.includes('API')) {
|
|
||||||
console.warn('API configuration issue - skipping test');
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
throw error;
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!result) return; // Skip if API configuration issue
|
|
||||||
|
|
||||||
// Check if the run completed
|
|
||||||
expect(result).toContain('Results Summary');
|
|
||||||
|
|
||||||
// Check results file in timestamped subdirectory
|
|
||||||
const dirs = execSync(`ls -d ${outputPath}/*/`, { encoding: 'utf-8' })
|
|
||||||
.trim()
|
|
||||||
.split('\n');
|
|
||||||
const latestDir = dirs[dirs.length - 1];
|
|
||||||
const resultsFile = join(latestDir, 'results.json');
|
|
||||||
|
|
||||||
expect(existsSync(resultsFile)).toBe(true);
|
|
||||||
|
|
||||||
const results = JSON.parse(readFileSync(resultsFile, 'utf-8'));
|
|
||||||
// Check that the task actually completed successfully
|
|
||||||
expect(results).toHaveProperty('accuracy');
|
|
||||||
expect(results.n_resolved).toBeGreaterThan(0); // At least one task should be resolved
|
|
||||||
expect(results.accuracy).toBeGreaterThan(0); // Accuracy should be greater than 0
|
|
||||||
}, DEFAULT_TIMEOUT_MS);
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user