fix: simplify todo_write integration test to make it more stable

2025-12-20 16:57:46 +00:00 · 2025-10-24 11:22:44 +08:00
parent 570b745f30
commit d482ad28dd
1 changed files with 10 additions and 73 deletions
--- a/integration-tests/todo_write.test.ts
+++ b/integration-tests/todo_write.test.ts
@@ -12,13 +12,12 @@ describe('todo_write', () => {
    const rig = new TestRig();
    await rig.setup('should be able to create and manage a todo list');
-    const prompt = `I want to implement a new feature to track user preferences. Here are the tasks:
+    const prompt = `Please create a todo list with these three simple tasks:
-1. Create a user preferences model
+1. Buy milk
-2. Add API endpoints for preferences
+2. Walk the dog  
-3. Implement frontend components
+3. Read a book
 4. Write tests for the new functionality
-Please create a todo list for these tasks.`;
+Use the todo_write tool to create this list.`;
    const result = await rig.run(prompt);
@@ -50,83 +49,21 @@ Please create a todo list for these tasks.`;
    expect(todoArgs.todos).toBeDefined();
    expect(Array.isArray(todoArgs.todos)).toBe(true);
-    expect(todoArgs.todos.length).toBeGreaterThan(0);
+    expect(todoArgs.todos.length).toBeGreaterThanOrEqual(3);
    // Check that all todos have the correct structure
    for (const todo of todoArgs.todos) {
      expect(todo.id).toBeDefined();
      expect(todo.content).toBeDefined();
-      expect(['pending', 'in_progress', 'completed']).toContain(todo.status);
+      expect(['pending', 'in_progress', 'completed', 'cancelled']).toContain(
        todo.status,
      );
    }
    // Log success info if verbose
    if (process.env['VERBOSE'] === 'true') {
      console.log('Todo list created successfully');
-    }
+      console.log(`Created ${todoArgs.todos.length} todos`);
  });
  /**
   * Checks that a follow-up prompt leads to another `todo_write` tool call
   * after an initial list has been created.
   *
   * Flow: ask for a three-item list, wait for the first `todo_write` call,
   * tell the model that one of the tasks has been started, then inspect the
   * arguments of the most recent `todo_write` call recorded in the tool logs.
   * Only the structure of the todos is asserted; the exact content is left to
   * the model.
   */
  it('should be able to update todo status', async () => {
    const rig = new TestRig();
    await rig.setup('should be able to update todo status');

    // First create a todo list
    const initialPrompt = `Create a todo list with these tasks:
 1. Set up project structure
 2. Implement authentication
 3. Add database migrations`;
    await rig.run(initialPrompt);
    await rig.waitForToolCall('todo_write');

    // Now update the todo list by marking one as in progress
    const updatePrompt = `I've started working on implementing authentication. Please update the todo list to reflect that.`;
    const result = await rig.run(updatePrompt);
    const foundToolCall = await rig.waitForToolCall('todo_write');

    // Dump debugging information before the assertion below fails
    if (!foundToolCall) {
      printDebugInfo(rig, result);
    }
    expect(
      foundToolCall,
      'Expected to find a todo_write tool call',
    ).toBeTruthy();

    // Validate model output - will throw if no output
    validateModelOutput(result, null, 'Todo update test');

    // Check that the tool was called with updated parameters
    const toolLogs = rig.readToolLogs();
    const todoWriteCalls = toolLogs.filter(
      (t) => t.toolRequest.name === 'todo_write',
    );
    expect(todoWriteCalls.length).toBeGreaterThan(0);

    // Parse the arguments of the latest call to verify the update
    const todoArgs = JSON.parse(
      todoWriteCalls[todoWriteCalls.length - 1].toolRequest.args,
    );
    expect(todoArgs.todos).toBeDefined();
    expect(Array.isArray(todoArgs.todos)).toBe(true);

    // The model might create a new list with just the task it's working on
    // or it might update the existing list. Let's check that we have at least one todo
    expect(todoArgs.todos.length).toBeGreaterThanOrEqual(1);

    // The list-creation test in this file also accepts 'cancelled'; accept it
    // here too so both tests agree on what a well-formed todo looks like.
    const validStatuses = ['pending', 'in_progress', 'completed', 'cancelled'];

    // Check that all todos have the correct structure
    for (const todo of todoArgs.todos) {
      expect(todo.id).toBeDefined();
      expect(todo.content).toBeDefined();
      expect(validStatuses).toContain(todo.status);
    }

    // Log success info if verbose
    if (process.env['VERBOSE'] === 'true') {
      console.log('Todo list updated successfully');
    }
  });
 });