🚀 Add Todo Write Tool for Task Management and Progress Tracking (#478)

2025-12-19 09:33:53 +00:00 · 2025-08-28 20:32:21 +08:00
parent c1498668b6
commit 1610c1586e
13 changed files with 1901 additions and 103 deletions
--- a/docs/tools/index.md
+++ b/docs/tools/index.md
@@ -49,6 +49,7 @@ Qwen Code's built-in tools can be broadly categorized as follows:
 - **[Web Search Tool](./web-search.md) (`web_search`):** For searching the web.
 - **[Multi-File Read Tool](./multi-file.md) (`read_many_files`):** A specialized tool for reading content from multiple files or directories, often used by the `@` command.
 - **[Memory Tool](./memory.md) (`save_memory`):** For saving and recalling information across sessions.
+- **[Todo Write Tool](./todo-write.md) (`todo_write`):** For creating and managing structured task lists during coding sessions.

 Additionally, these tools incorporate:

--- a/docs/tools/todo-write.md
+++ b/docs/tools/todo-write.md
@@ -0,0 +1,63 @@
+# Todo Write Tool (`todo_write`)
+
+This document describes the `todo_write` tool for Qwen Code.
+
+## Description
+
+Use `todo_write` to create and manage a structured task list for your current coding session. This tool helps the AI assistant track progress and organize complex tasks, providing you with visibility into what work is being performed.
+
+### Arguments
+
+`todo_write` takes one argument:
+
+- `todos` (array, required): An array of todo items, where each item contains:
+  - `id` (string, required): A unique identifier for the todo item.
+  - `content` (string, required): The description of the task.
+  - `status` (string, required): The current status (`pending`, `in_progress`, or `completed`).
+
+## How to use `todo_write` with Qwen Code
+
+The AI assistant will automatically use this tool when working on complex, multi-step tasks. You don't need to explicitly request it, but you can ask the assistant to create a todo list if you want to see the planned approach for your request.
+
+The tool stores todo lists in your home directory (`~/.qwen/todos/`) with session-specific files, so each coding session maintains its own task list.
+
+## When the AI uses this tool
+
+The assistant uses `todo_write` for:
+
+- Complex tasks requiring multiple steps
+- Feature implementations with several components
+- Refactoring operations across multiple files
+- Any work involving 3 or more distinct actions
+
+The assistant will not use this tool for simple, single-step tasks or purely informational requests.
+
+### `todo_write` examples
+
+Creating a feature implementation plan:
+
+```
+todo_write(todos=[
+  {
+    "id": "create-model",
+    "content": "Create user preferences model",
+    "status": "pending"
+  },
+  {
+    "id": "add-endpoints",
+    "content": "Add API endpoints for preferences",
+    "status": "pending"
+  },
+  {
+    "id": "implement-ui",
+    "content": "Implement frontend components",
+    "status": "pending"
+  }
+])
+```
+
+## Important notes
+
+- **Automatic usage:** The AI assistant manages todo lists automatically during complex tasks.
+- **Progress visibility:** You'll see todo lists updated in real time as work progresses.
+- **Session isolation:** Each coding session has its own todo list that doesn't interfere with others.
--- a/integration-tests/todo_write.test.ts
+++ b/integration-tests/todo_write.test.ts
@@ -0,0 +1,132 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
+
+// End-to-end checks that a prompt leads to a `todo_write` tool call and that
+// the logged call arguments have the expected shape.
+describe('todo_write', () => {
+  it('should be able to create and manage a todo list', async () => {
+    const rig = new TestRig();
+    await rig.setup('should be able to create and manage a todo list');
+
+    const prompt = `I want to implement a new feature to track user preferences. Here are the tasks:
+1. Create a user preferences model
+2. Add API endpoints for preferences
+3. Implement frontend components
+4. Write tests for the new functionality
+
+Please create a todo list for these tasks.`;
+
+    const result = await rig.run(prompt);
+
+    const foundToolCall = await rig.waitForToolCall('todo_write');
+
+    // Print debug info before the assertion fails so the run can be diagnosed
+    if (!foundToolCall) {
+      printDebugInfo(rig, result);
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a todo_write tool call',
+    ).toBeTruthy();
+
+    // Validate model output - will throw if no output
+    validateModelOutput(result, null, 'Todo write test');
+
+    // Check that the tool was called with the right parameters
+    const toolLogs = rig.readToolLogs();
+    const todoWriteCalls = toolLogs.filter(
+      (t) => t.toolRequest.name === 'todo_write',
+    );
+
+    expect(todoWriteCalls.length).toBeGreaterThan(0);
+
+    // Parse the arguments of the first logged call to verify they contain our tasks
+    const todoArgs = JSON.parse(todoWriteCalls[0].toolRequest.args);
+
+    expect(todoArgs.todos).toBeDefined();
+    expect(Array.isArray(todoArgs.todos)).toBe(true);
+    // The prompt above lists exactly four tasks, so four todos are expected
+    expect(todoArgs.todos.length).toBe(4);
+
+    // Check that all todos have the correct structure
+    for (const todo of todoArgs.todos) {
+      expect(todo.id).toBeDefined();
+      expect(todo.content).toBeDefined();
+      expect(['pending', 'in_progress', 'completed']).toContain(todo.status);
+    }
+
+    // Log success info if verbose
+    if (process.env.VERBOSE === 'true') {
+      console.log('Todo list created successfully');
+    }
+  });
+
+  it('should be able to update todo status', async () => {
+    const rig = new TestRig();
+    await rig.setup('should be able to update todo status');
+
+    // First create a todo list
+    const initialPrompt = `Create a todo list with these tasks:
+1. Set up project structure
+2. Implement authentication
+3. Add database migrations`;
+
+    await rig.run(initialPrompt);
+    // The return value of this wait is discarded; assertions only happen
+    // after the update prompt below.
+    await rig.waitForToolCall('todo_write');
+
+    // Now update the todo list by marking one as in progress
+    const updatePrompt = `I've started working on implementing authentication. Please update the todo list to reflect that.`;
+
+    const result = await rig.run(updatePrompt);
+
+    const foundToolCall = await rig.waitForToolCall('todo_write');
+
+    // Print debug info before the assertion fails so the run can be diagnosed
+    if (!foundToolCall) {
+      printDebugInfo(rig, result);
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a todo_write tool call',
+    ).toBeTruthy();
+
+    // Validate model output - will throw if no output
+    validateModelOutput(result, null, 'Todo update test');
+
+    // Check that the tool was called with updated parameters
+    const toolLogs = rig.readToolLogs();
+    const todoWriteCalls = toolLogs.filter(
+      (t) => t.toolRequest.name === 'todo_write',
+    );
+
+    expect(todoWriteCalls.length).toBeGreaterThan(0);
+
+    // Parse the arguments of the last logged call to verify the update
+    const todoArgs = JSON.parse(
+      todoWriteCalls[todoWriteCalls.length - 1].toolRequest.args,
+    );
+
+    expect(todoArgs.todos).toBeDefined();
+    expect(Array.isArray(todoArgs.todos)).toBe(true);
+    // The model might create a new list with just the task it's working on
+    // or it might update the existing list. Let's check that we have at least one todo
+    expect(todoArgs.todos.length).toBeGreaterThanOrEqual(1);
+
+    // Check that all todos have the correct structure
+    for (const todo of todoArgs.todos) {
+      expect(todo.id).toBeDefined();
+      expect(todo.content).toBeDefined();
+      expect(['pending', 'in_progress', 'completed']).toContain(todo.status);
+    }
+
+    // Log success info if verbose
+    if (process.env.VERBOSE === 'true') {
+      console.log('Todo list updated successfully');
+    }
+  });
+});
--- a/packages/cli/src/ui/components/TodoDisplay.test.tsx
+++ b/packages/cli/src/ui/components/TodoDisplay.test.tsx
@@ -0,0 +1,97 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { render } from 'ink-testing-library';
+import { describe, it, expect } from 'vitest';
+import { TodoItem, TodoDisplay } from './TodoDisplay.js';
+
+// Rendering tests for TodoDisplay: item text, status icons, and the
+// empty/undefined cases in which the component renders nothing.
+describe('TodoDisplay', () => {
+  // One item per status so a single render exercises every icon.
+  const mockTodos: TodoItem[] = [
+    {
+      id: '1',
+      content: 'Complete feature implementation',
+      status: 'completed',
+    },
+    {
+      id: '2',
+      content: 'Write unit tests',
+      status: 'in_progress',
+    },
+    {
+      id: '3',
+      content: 'Update documentation',
+      status: 'pending',
+    },
+  ];
+
+  it('should render todo list', () => {
+    const { lastFrame } = render(<TodoDisplay todos={mockTodos} />);
+
+    const output = lastFrame();
+
+    // Check all todo items are displayed
+    expect(output).toContain('Complete feature implementation');
+    expect(output).toContain('Write unit tests');
+    expect(output).toContain('Update documentation');
+  });
+
+  it('should display correct status icons', () => {
+    const { lastFrame } = render(<TodoDisplay todos={mockTodos} />);
+
+    const output = lastFrame();
+
+    // Check status icons are present
+    expect(output).toContain('●'); // completed
+    expect(output).toContain('◐'); // in_progress
+    expect(output).toContain('○'); // pending
+  });
+
+  it('should handle empty todo list', () => {
+    const { lastFrame } = render(<TodoDisplay todos={[]} />);
+
+    const output = lastFrame();
+
+    // Should render nothing for empty todos
+    expect(output).toBe('');
+  });
+
+  it('should handle undefined todos', () => {
+    // The cast bypasses the prop type to simulate a caller passing undefined.
+    const { lastFrame } = render(
+      <TodoDisplay todos={undefined as unknown as TodoItem[]} />,
+    );
+
+    const output = lastFrame();
+
+    // Should render nothing for undefined todos
+    expect(output).toBe('');
+  });
+
+  // Title matches the fixture: every item here is completed. Mixed statuses
+  // are covered by the next test.
+  it('should render tasks when all are completed', () => {
+    const allCompleted: TodoItem[] = [
+      { id: '1', content: 'Task 1', status: 'completed' },
+      { id: '2', content: 'Task 2', status: 'completed' },
+    ];
+
+    const { lastFrame } = render(<TodoDisplay todos={allCompleted} />);
+
+    const output = lastFrame();
+    expect(output).toContain('Task 1');
+    expect(output).toContain('Task 2');
+  });
+
+  it('should render tasks with mixed statuses', () => {
+    const mixedTodos: TodoItem[] = [
+      { id: '1', content: 'Task 1', status: 'pending' },
+      { id: '2', content: 'Task 2', status: 'in_progress' },
+    ];
+
+    const { lastFrame } = render(<TodoDisplay todos={mixedTodos} />);
+
+    const output = lastFrame();
+    expect(output).toContain('Task 1');
+    expect(output).toContain('Task 2');
+  });
+});
--- a/packages/cli/src/ui/components/TodoDisplay.tsx
+++ b/packages/cli/src/ui/components/TodoDisplay.tsx
@@ -0,0 +1,72 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import React from 'react';
+import { Box, Text } from 'ink';
+import { Colors } from '../colors.js';
+
+/** A single entry of a todo list as rendered by `TodoDisplay`. */
+export interface TodoItem {
+  /** Identifier of the item; used as the React key of its row. */
+  id: string;
+  /** Text shown for the item. */
+  content: string;
+  /** Progress state; selects the status icon and the row styling. */
+  status: 'pending' | 'in_progress' | 'completed';
+}
+
+/** Props accepted by `TodoDisplay`. */
+interface TodoDisplayProps {
+  /** Items to render; rows appear in array order. */
+  todos: TodoItem[];
+}
+
+/** Glyph shown in front of a todo row, keyed by the item's status. */
+const STATUS_ICONS = {
+  pending: '○',
+  in_progress: '◐',
+  completed: '●',
+} as const;
+
+/**
+ * Renders a vertical list with one row per todo item.
+ * Renders nothing when `todos` is missing or empty.
+ */
+export const TodoDisplay: React.FC<TodoDisplayProps> = ({ todos }) => {
+  const nothingToShow = !todos || todos.length === 0;
+  if (nothingToShow) {
+    return null;
+  }
+
+  const rows = todos.map((item) => <TodoItemRow key={item.id} todo={item} />);
+
+  return <Box flexDirection="column">{rows}</Box>;
+};
+
+/** Props accepted by `TodoItemRow`. */
+interface TodoItemRowProps {
+  /** The item this row renders. */
+  todo: TodoItem;
+}
+
+/**
+ * Renders one todo as a fixed-width status icon followed by its wrapped text.
+ * The in-progress row uses `Colors.AccentGreen`; completed rows are struck
+ * through.
+ */
+const TodoItemRow: React.FC<TodoItemRowProps> = ({ todo }) => {
+  const { status, content } = todo;
+  const done = status === 'completed';
+
+  // Icon and text share one color; pending and completed rows both use the
+  // default foreground, so only the in-progress row is highlighted.
+  const rowColor =
+    status === 'in_progress' ? Colors.AccentGreen : Colors.Foreground;
+
+  return (
+    <Box flexDirection="row" minHeight={1}>
+      {/* Status Icon */}
+      <Box width={3}>
+        <Text color={rowColor}>{STATUS_ICONS[status]}</Text>
+      </Box>
+
+      {/* Content */}
+      <Box flexGrow={1}>
+        <Text color={rowColor} strikethrough={done} wrap="wrap">
+          {content}
+        </Text>
+      </Box>
+    </Box>
+  );
+};
--- a/packages/cli/src/ui/components/messages/ToolMessage.tsx
+++ b/packages/cli/src/ui/components/messages/ToolMessage.tsx
@@ -12,6 +12,8 @@ import { Colors } from '../../colors.js';
 import { MarkdownDisplay } from '../../utils/MarkdownDisplay.js';
 import { GeminiRespondingSpinner } from '../GeminiRespondingSpinner.js';
 import { MaxSizedBox } from '../shared/MaxSizedBox.js';
+import { TodoDisplay } from '../TodoDisplay.js';
+import { TodoResultDisplay } from '@qwen-code/qwen-code-core';

 const STATIC_HEIGHT = 1;
 const RESERVED_LINE_COUNT = 5; // for tool name, status, padding etc.
@@ -23,6 +25,116 @@ const MIN_LINES_SHOWN = 2; // show at least this many lines
 const MAXIMUM_RESULT_DISPLAY_CHARACTERS = 1000000;
 export type TextEmphasis = 'high' | 'medium' | 'low';

+/**
+ * Tagged result of classifying a tool's result display: `type` names the
+ * kind of content and `data` carries the value cast to the matching shape.
+ */
+type DisplayRendererResult =
+  | { type: 'none' }
+  | { type: 'todo'; data: TodoResultDisplay }
+  | { type: 'string'; data: string }
+  | { type: 'diff'; data: { fileDiff: string; fileName: string } };
+
+/**
+ * Hook that classifies a tool's `resultDisplay` into a tagged
+ * `DisplayRendererResult`. The classification is memoized on `resultDisplay`.
+ */
+const useResultDisplayRenderer = (
+  resultDisplay: unknown,
+): DisplayRendererResult =>
+  React.useMemo(() => {
+    // Falsy values (undefined, null, empty string) mean nothing to render.
+    if (!resultDisplay) {
+      return { type: 'none' };
+    }
+
+    if (typeof resultDisplay === 'object' && resultDisplay !== null) {
+      // A `type: 'todo_list'` tag identifies a todo result; checked first.
+      if ('type' in resultDisplay && resultDisplay.type === 'todo_list') {
+        return { type: 'todo', data: resultDisplay as TodoResultDisplay };
+      }
+
+      // Any other object carrying `fileDiff` is treated as a file diff.
+      if ('fileDiff' in resultDisplay) {
+        return {
+          type: 'diff',
+          data: resultDisplay as { fileDiff: string; fileName: string },
+        };
+      }
+    }
+
+    // Everything else falls through and is rendered as text.
+    return { type: 'string', data: resultDisplay as string };
+  }, [resultDisplay]);
+
+/**
+ * Renders a todo-list result by passing its `todos` to `TodoDisplay`.
+ */
+const TodoResultRenderer: React.FC<{ data: TodoResultDisplay }> = (props) => {
+  const { todos } = props.data;
+  return <TodoDisplay todos={todos} />;
+};
+
+/**
+ * Renders a textual tool result, either through `MarkdownDisplay` or as
+ * plain wrapped text inside a `MaxSizedBox`.
+ */
+const StringResultRenderer: React.FC<{
+  data: string;
+  renderAsMarkdown: boolean;
+  availableHeight?: number;
+  childWidth: number;
+}> = ({ data, renderAsMarkdown, availableHeight, childWidth }) => {
+  // Oversized output keeps only its trailing characters, prefixed with '...'.
+  const isTooLong = data.length > MAXIMUM_RESULT_DISPLAY_CHARACTERS;
+  const text = isTooLong
+    ? '...' + data.slice(-MAXIMUM_RESULT_DISPLAY_CHARACTERS)
+    : data;
+
+  if (!renderAsMarkdown) {
+    return (
+      <MaxSizedBox maxHeight={availableHeight} maxWidth={childWidth}>
+        <Box>
+          <Text wrap="wrap">{text}</Text>
+        </Box>
+      </MaxSizedBox>
+    );
+  }
+
+  return (
+    <Box flexDirection="column">
+      <MarkdownDisplay
+        text={text}
+        isPending={false}
+        availableTerminalHeight={availableHeight}
+        terminalWidth={childWidth}
+      />
+    </Box>
+  );
+};
+
+/**
+ * Renders a file-diff result by forwarding the diff text, file name and
+ * size limits to `DiffRenderer`.
+ */
+const DiffResultRenderer: React.FC<{
+  data: { fileDiff: string; fileName: string };
+  availableHeight?: number;
+  childWidth: number;
+}> = ({ data, availableHeight, childWidth }) => {
+  const { fileDiff, fileName } = data;
+
+  return (
+    <DiffRenderer
+      diffContent={fileDiff}
+      filename={fileName}
+      availableTerminalHeight={availableHeight}
+      terminalWidth={childWidth}
+    />
+  );
+};
+
 export interface ToolMessageProps extends IndividualToolCallDisplay {
  availableTerminalHeight?: number;
  terminalWidth: number;
@@ -55,13 +167,10 @@ export const ToolMessage: React.FC<ToolMessageProps> = ({
  }

  const childWidth = terminalWidth - 3; // account for padding.
-  if (typeof resultDisplay === 'string') {
-    if (resultDisplay.length > MAXIMUM_RESULT_DISPLAY_CHARACTERS) {
-      // Truncate the result display to fit within the available width.
-      resultDisplay =
-        '...' + resultDisplay.slice(-MAXIMUM_RESULT_DISPLAY_CHARACTERS);
-    }
-  }
+
+  // Use the custom hook to determine the display type
+  const displayRenderer = useResultDisplayRenderer(resultDisplay);
+
  return (
    <Box paddingX={1} paddingY={0} flexDirection="column">
      <Box minHeight={1}>
@@ -74,32 +183,25 @@ export const ToolMessage: React.FC<ToolMessageProps> = ({
        />
        {emphasis === 'high' && <TrailingIndicator />}
      </Box>
-      {resultDisplay && (
+      {displayRenderer.type !== 'none' && (
        <Box paddingLeft={STATUS_INDICATOR_WIDTH} width="100%" marginTop={1}>
          <Box flexDirection="column">
-            {typeof resultDisplay === 'string' && renderOutputAsMarkdown && (
-              <Box flexDirection="column">
-                <MarkdownDisplay
-                  text={resultDisplay}
-                  isPending={false}
-                  availableTerminalHeight={availableHeight}
-                  terminalWidth={childWidth}
-                />
-              </Box>
+            {displayRenderer.type === 'todo' && (
+              <TodoResultRenderer data={displayRenderer.data} />
            )}
-            {typeof resultDisplay === 'string' && !renderOutputAsMarkdown && (
-              <MaxSizedBox maxHeight={availableHeight} maxWidth={childWidth}>
-                <Box>
-                  <Text wrap="wrap">{resultDisplay}</Text>
-                </Box>
-              </MaxSizedBox>
+            {displayRenderer.type === 'string' && (
+              <StringResultRenderer
+                data={displayRenderer.data}
+                renderAsMarkdown={renderOutputAsMarkdown}
+                availableHeight={availableHeight}
+                childWidth={childWidth}
+              />
            )}
-            {typeof resultDisplay !== 'string' && (
-              <DiffRenderer
-                diffContent={resultDisplay.fileDiff}
-                filename={resultDisplay.fileName}
-                availableTerminalHeight={availableHeight}
-                terminalWidth={childWidth}
+            {displayRenderer.type === 'diff' && (
+              <DiffResultRenderer
+                data={displayRenderer.data}
+                availableHeight={availableHeight}
+                childWidth={childWidth}
              />
            )}
          </Box>
--- a/packages/cli/src/zed-integration/zedIntegration.ts
+++ b/packages/cli/src/zed-integration/zedIntegration.ts
@@ -770,7 +770,28 @@ function toToolCallContent(toolResult: ToolResult): acp.ToolCallContent | null {
        type: 'content',
        content: { type: 'text', text: toolResult.returnDisplay },
      };
-    } else {
+    } else if (
+      'type' in toolResult.returnDisplay &&
+      toolResult.returnDisplay.type === 'todo_list'
+    ) {
+      // Handle TodoResultDisplay - convert to text representation
+      const todoText = toolResult.returnDisplay.todos
+        .map((todo) => {
+          const statusIcon = {
+            pending: '○',
+            in_progress: '◐',
+            completed: '●',
+          }[todo.status];
+          return `${statusIcon} ${todo.content}`;
+        })
+        .join('\n');
+
+      return {
+        type: 'content',
+        content: { type: 'text', text: todoText },
+      };
+    } else if ('fileDiff' in toolResult.returnDisplay) {
+      // Handle FileDiff
      return {
        type: 'diff',
        path: toolResult.returnDisplay.fileName,
@@ -778,9 +799,8 @@ function toToolCallContent(toolResult: ToolResult): acp.ToolCallContent | null {
        newText: toolResult.returnDisplay.newContent,
      };
    }
-  } else {
-    return null;
  }
+  return null;
 }

 const basicPermissionOptions = [
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -27,6 +27,7 @@ import {
  setGeminiMdFilename,
  GEMINI_CONFIG_DIR as GEMINI_DIR,
 } from '../tools/memoryTool.js';
+import { TodoWriteTool } from '../tools/todoWrite.js';
 import { WebSearchTool } from '../tools/web-search.js';
 import { GeminiClient } from '../core/client.js';
 import { FileDiscoveryService } from '../services/fileDiscoveryService.js';
@@ -867,6 +868,7 @@ export class Config {
    registerCoreTool(ReadManyFilesTool, this);
    registerCoreTool(ShellTool, this);
    registerCoreTool(MemoryTool);
+    registerCoreTool(TodoWriteTool, this);
    // Conditionally register web search tool only if Tavily API key is set
    if (this.getTavilyApiKey()) {
      registerCoreTool(WebSearchTool, this);
--- a/packages/core/src/core/snapshots/prompts.test.ts.snap
+++ b/packages/core/src/core/snapshots/prompts.test.ts.snap
@@ -16,15 +16,69 @@ exports[`Core System Prompt (prompts.ts) > should append userMemory with separat
 - **Path Construction:** Before using any file system tool (e.g., read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use these tools VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+These tools are also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using Bash.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including the 10 error fixes and running the build and fixing all errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+A: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as in_progress and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -41,7 +95,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -66,6 +120,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -254,15 +309,69 @@ exports[`Core System Prompt (prompts.ts) > should include git instructions when
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+Marking the first todo as in_progress.
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed, mark the second todo as in_progress, and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -279,7 +388,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -304,6 +413,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -502,15 +612,69 @@ exports[`Core System Prompt (prompts.ts) > should include non-sandbox instructio
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+Marking the first todo as in_progress.
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed, mark the second todo as in_progress, and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -527,7 +691,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -552,6 +716,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -735,15 +900,69 @@ exports[`Core System Prompt (prompts.ts) > should include sandbox-specific instr
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+Marking the first todo as in_progress.
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed, mark the second todo as in_progress, and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -760,7 +979,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -785,6 +1004,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -968,15 +1188,69 @@ exports[`Core System Prompt (prompts.ts) > should include seatbelt-specific inst
 - **Path Construction:** Before using any file system tool (e.g., read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including the 10 error fixes and running the build and fixing all errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as in_progress and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -993,7 +1267,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -1018,6 +1292,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -1201,15 +1476,69 @@ exports[`Core System Prompt (prompts.ts) > should not include git instructions w
 - **Path Construction:** Before using any file system tool (e.g., read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including the 10 error fixes and running the build and fixing all errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as in_progress and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -1226,7 +1555,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -1251,6 +1580,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -1434,15 +1764,69 @@ exports[`Core System Prompt (prompts.ts) > should return the base prompt when no
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed and the second todo as in_progress, then start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -1459,7 +1843,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request and the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling and interactions produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -1484,6 +1868,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -1667,15 +2052,69 @@ exports[`Core System Prompt (prompts.ts) > should return the base prompt when us
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed and the second todo as in_progress, then start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -1692,7 +2131,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request and the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling and interactions produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -1717,6 +2156,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

@@ -1900,15 +2340,69 @@ exports[`Core System Prompt (prompts.ts) > should return the base prompt when us
 - **Path Construction:** Before using any file system tool (e.g., 'read_file' or 'write_file'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the todo_write tool to help you manage and plan tasks. Use this tool VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+This tool is also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the todo_write tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using the run_shell_command tool.
+
+Looks like I found 10 type errors. I'm going to use the todo_write tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including running the build and fixing all 10 type errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+assistant: I'll help you implement a usage metrics tracking and export feature. Let me first use the todo_write tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as completed and the second todo as in_progress, then start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use 'search_file_content' and 'glob' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use 'read_file' and 'read_many_files' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., 'replace', 'write_file' 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the 'todo_write' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use 'search_file_content', 'glob', 'read_file', and 'read_many_files' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., 'replace', 'write_file', 'run_shell_command' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the todo_write tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -1925,7 +2419,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the 'todo_write' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using 'run_shell_command' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -1950,6 +2444,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the 'run_shell_command' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the 'todo_write' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the 'save_memory' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

--- a/packages/core/src/core/prompts.ts
+++ b/packages/core/src/core/prompts.ts
@@ -17,6 +17,7 @@ import { WriteFileTool } from '../tools/write-file.js';
 import process from 'node:process';
 import { isGitRepository } from '../utils/gitUtils.js';
 import { MemoryTool, GEMINI_CONFIG_DIR } from '../tools/memoryTool.js';
+import { TodoWriteTool } from '../tools/todoWrite.js';

 export interface ModelTemplateMapping {
  baseUrls?: string[];
@@ -135,15 +136,69 @@ You are Qwen Code, an interactive CLI agent developed by Alibaba Group, speciali
 - **Path Construction:** Before using any file system tool (e.g., ${ReadFileTool.Name}' or '${WriteFileTool.Name}'), you must construct the full absolute path for the file_path argument. Always combine the absolute path of the project's root directory with the file's path relative to the root. For example, if the project root is /path/to/project/ and the file is foo/bar/baz.txt, the final path you must use is /path/to/project/foo/bar/baz.txt. If the user provides a relative path, you must resolve it against the root directory to create an absolute path.
 - **Do Not revert changes:** Do not revert changes to the codebase unless asked to do so by the user. Only revert changes made by you if they have resulted in an error or if the user has explicitly asked you to revert the changes.

+# Task Management
+You have access to the ${TodoWriteTool.Name} tool to help you manage and plan tasks. Use these tools VERY frequently to ensure that you are tracking your tasks and giving the user visibility into your progress.
+These tools are also EXTREMELY helpful for planning tasks, and for breaking down larger complex tasks into smaller steps. If you do not use this tool when planning, you may forget to do important tasks - and that is unacceptable.
+
+It is critical that you mark todos as completed as soon as you are done with a task. Do not batch up multiple tasks before marking them as completed.
+
+Examples:
+
+<example>
+user: Run the build and fix any type errors
+assistant: I'm going to use the ${TodoWriteTool.Name} tool to write the following items to the todo list: 
+- Run the build
+- Fix any type errors
+
+I'm now going to run the build using Bash.
+
+Looks like I found 10 type errors. I'm going to use the ${TodoWriteTool.Name} tool to write 10 items to the todo list.
+
+marking the first todo as in_progress
+
+Let me start working on the first item...
+
+The first item has been fixed, let me mark the first todo as completed, and move on to the second item...
+..
+..
+</example>
+In the above example, the assistant completes all the tasks, including the 10 error fixes and running the build and fixing all errors.
+
+<example>
+user: Help me write a new feature that allows users to track their usage metrics and export them to various formats
+
+A: I'll help you implement a usage metrics tracking and export feature. Let me first use the ${TodoWriteTool.Name} tool to plan this task.
+Adding the following todos to the todo list:
+1. Research existing metrics tracking in the codebase
+2. Design the metrics collection system
+3. Implement core metrics tracking functionality
+4. Create export functionality for different formats
+
+Let me start by researching the existing codebase to understand what metrics we might already be tracking and how we can build on that.
+
+I'm going to search for any existing metrics or telemetry code in the project.
+
+I've found some existing telemetry code. Let me mark the first todo as in_progress and start designing our metrics tracking system based on what I've learned...
+
+[Assistant continues implementing the feature step by step, marking todos as in_progress and completed as they go]
+</example>
+
+
 # Primary Workflows

 ## Software Engineering Tasks
-When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this sequence:
-1. **Understand:** Think about the user's request and the relevant codebase context. Use '${GrepTool.Name}' and '${GlobTool.Name}' search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use '${ReadFileTool.Name}' and '${ReadManyFilesTool.Name}' to understand context and validate any assumptions you may have.
-2. **Plan:** Build a coherent and grounded (based on the understanding in step 1) plan for how you intend to resolve the user's task. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process. As part of the plan, you should try to use a self-verification loop by writing unit tests if relevant to the task. Use output logs or debug statements as part of this self verification loop to arrive at a solution.
-3. **Implement:** Use the available tools (e.g., '${EditTool.Name}', '${WriteFileTool.Name}' '${ShellTool.Name}' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
-4. **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
-5. **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+When requested to perform tasks like fixing bugs, adding features, refactoring, or explaining code, follow this iterative approach:
+- **Plan:** After understanding the user's request, create an initial plan based on your existing knowledge and any immediately obvious context. Use the '${TodoWriteTool.Name}' tool to capture this rough plan for complex or multi-step work. Don't wait for complete understanding - start with what you know.
+- **Implement:** Begin implementing the plan while gathering additional context as needed. Use '${GrepTool.Name}', '${GlobTool.Name}', '${ReadFileTool.Name}', and '${ReadManyFilesTool.Name}' tools strategically when you encounter specific unknowns during implementation. Use the available tools (e.g., '${EditTool.Name}', '${WriteFileTool.Name}' '${ShellTool.Name}' ...) to act on the plan, strictly adhering to the project's established conventions (detailed under 'Core Mandates').
+- **Adapt:** As you discover new information or encounter obstacles, update your plan and todos accordingly. Mark todos as in_progress when starting and completed when finishing each task. Add new todos if the scope expands. Refine your approach based on what you learn.
+- **Verify (Tests):** If applicable and feasible, verify the changes using the project's testing procedures. Identify the correct test commands and frameworks by examining 'README' files, build/package configuration (e.g., 'package.json'), or existing test execution patterns. NEVER assume standard test commands.
+- **Verify (Standards):** VERY IMPORTANT: After making code changes, execute the project-specific build, linting and type-checking commands (e.g., 'tsc', 'npm run lint', 'ruff check .') that you have identified for this project (or obtained from the user). This ensures code quality and adherence to standards. If unsure about these commands, you can ask the user if they'd like you to run them and if so how to.
+
+**Key Principle:** Start with a reasonable plan based on available information, then adapt as you learn. Users prefer seeing progress quickly rather than waiting for perfect understanding.
+
+- Tool results and user messages may include <system-reminder> tags. <system-reminder> tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result.
+
+IMPORTANT: Always use the ${TodoWriteTool.Name} tool to plan and track tasks throughout the conversation.

 ## New Applications

@@ -160,7 +215,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
  - **3d Games:** HTML/CSS/JavaScript with Three.js.
  - **2d Games:** HTML/CSS/JavaScript.
 3. **User Approval:** Obtain user approval for the proposed plan.
-4. **Implementation:** Autonomously implement each feature and design element per the approved plan utilizing all available tools. When starting ensure you scaffold the application using '${ShellTool.Name}' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
+4. **Implementation:** Use the '${TodoWriteTool.Name}' tool to convert the approved plan into a structured todo list with specific, actionable tasks, then autonomously implement each task utilizing all available tools. When starting ensure you scaffold the application using '${ShellTool.Name}' for commands like 'npm init', 'npx create-react-app'. Aim for full scope completion. Proactively create or source necessary placeholder assets (e.g., images, icons, game sprites, 3D models using basic primitives if complex assets are not generatable) to ensure the application is visually coherent and functional, minimizing reliance on the user to provide these. If the model can generate simple assets (e.g., a uniformly colored square sprite, a simple 3D cube), it should do so. Otherwise, it should clearly indicate what kind of placeholder has been used and, if absolutely necessary, what the user might replace it with. Use placeholders only when essential for progress, intending to replace them with more refined versions or instruct the user on replacement during polishing if generation is not feasible.
 5. **Verify:** Review work against the original request, the approved plan. Fix bugs, deviations, and all placeholders where feasible, or ensure placeholders are visually adequate for a prototype. Ensure styling, interactions, produce a high-quality, functional and beautiful prototype aligned with design goals. Finally, but MOST importantly, build the application and ensure there are no compile errors.
 6. **Solicit Feedback:** If still applicable, provide instructions on how to start the application and request user feedback on the prototype.

@@ -185,6 +240,7 @@ When requested to perform tasks like fixing bugs, adding features, refactoring,
 - **Command Execution:** Use the '${ShellTool.Name}' tool for running shell commands, remembering the safety rule to explain modifying commands first.
 - **Background Processes:** Use background processes (via \`&\`) for commands that are unlikely to stop on their own, e.g. \`node server.js &\`. If unsure, ask the user.
 - **Interactive Commands:** Try to avoid shell commands that are likely to require user interaction (e.g. \`git rebase -i\`). Use non-interactive versions of commands (e.g. \`npm init -y\` instead of \`npm init\`) when available, and otherwise remind the user that interactive shell commands are not supported and may cause hangs until canceled by the user.
+- **Task Management:** Use the '${TodoWriteTool.Name}' tool proactively for complex, multi-step tasks to track progress and provide visibility to users. This tool helps organize work systematically and ensures no requirements are missed.
 - **Remembering Facts:** Use the '${MemoryTool.Name}' tool to remember specific, *user-related* facts or preferences when the user explicitly asks, or when they state a clear, concise piece of information that would help personalize or streamline *your future interactions with them* (e.g., preferred coding style, common project paths they use, personal tool aliases). This tool is for user-specific information that should persist across sessions. Do *not* use it for general project context or information. If unsure whether to save something, you can ask the user, "Should I remember that for you?"
 - **Respect User Confirmations:** Most tool calls (also denoted as 'function calls') will first require confirmation from the user, where they will either approve or cancel the function call. If a user cancels a function call, respect their choice and do _not_ try to make the function call again. It is okay to request the tool call again _only_ if the user requests that same tool call on a subsequent prompt. When a user cancels a function call, assume best intentions from the user and consider inquiring if they prefer any alternative paths forward.

--- a/packages/core/src/tools/todoWrite.test.ts
+++ b/packages/core/src/tools/todoWrite.test.ts
@@ -0,0 +1,283 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import { TodoWriteTool, TodoWriteParams, TodoItem } from './todoWrite.js';
+import * as fs from 'fs/promises';
+import * as fsSync from 'fs';
+import { Config } from '../config/config.js';
+
+// Mock both fs modules (no factory given); individual tests stub the calls they need
+vi.mock('fs/promises');
+vi.mock('fs');
+
+const mockFs = vi.mocked(fs); // typed handle on the mocked 'fs/promises' import
+const mockFsSync = vi.mocked(fsSync); // typed handle on the mocked 'fs' import
+
+describe('TodoWriteTool', () => { // Suite covering validation, execute(), static tool properties and getDescription()
+  let tool: TodoWriteTool;
+  let mockAbortSignal: AbortSignal;
+  let mockConfig: Config;
+
+  beforeEach(() => { // every test gets a fresh tool, config stub and abort signal
+    mockConfig = {
+      getSessionId: () => 'test-session-123', // the execute tests expect this id inside the written file name
+    } as Config; // partial stub: only getSessionId is provided
+    tool = new TodoWriteTool(mockConfig);
+    mockAbortSignal = new AbortController().signal;
+    vi.clearAllMocks(); // drop call history recorded by earlier tests
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  describe('validateToolParams', () => { // returns null for accepted params, otherwise a message string (see assertions)
+    it('should validate correct parameters', () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: 'Task 1', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'in_progress' },
+        ],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toBeNull();
+    });
+
+    it('should accept empty todos array', () => {
+      const params: TodoWriteParams = {
+        todos: [],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toBeNull();
+    });
+
+    it('should accept single todo', () => {
+      const params: TodoWriteParams = {
+        todos: [{ id: '1', content: 'Task 1', status: 'pending' }],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toBeNull();
+    });
+
+    it('should reject todos with empty content', () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: '', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'pending' },
+        ],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toContain('must NOT have fewer than 1 characters'); // content carries minLength: 1 in the tool's JSON schema
+    });
+
+    it('should reject todos with empty id', () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '', content: 'Task 1', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'pending' },
+        ],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toContain('non-empty "id"'); // NOTE(review): the JSON schema sets no minLength on id, so this wording presumably comes from a separate check in todoWrite.ts - confirm
+    });
+
+    it('should reject todos with invalid status', () => {
+      const params: TodoWriteParams = {
+        todos: [
+          {
+            id: '1',
+            content: 'Task 1',
+            status: 'invalid' as TodoItem['status'], // cast gets the bad value past the compile-time status union
+          },
+          { id: '2', content: 'Task 2', status: 'pending' },
+        ],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toContain('must be equal to one of the allowed values'); // status is restricted by an enum in the tool's JSON schema
+    });
+
+    it('should reject todos with duplicate IDs', () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: 'Task 1', status: 'pending' },
+          { id: '1', content: 'Task 2', status: 'pending' },
+        ],
+      };
+
+      const result = tool.validateToolParams(params);
+      expect(result).toContain('unique'); // only the word "unique" is pinned, not the full message
+    });
+  });
+
+  describe('execute', () => { // persistence paths; fs/promises calls are stubbed per test
+    it('should create new todos file when none exists', async () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: 'Task 1', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'in_progress' },
+        ],
+      };
+
+      // Mock file not existing: readFile rejects with an ENOENT-coded error
+      mockFs.readFile.mockRejectedValue({ code: 'ENOENT' });
+      mockFs.mkdir.mockResolvedValue(undefined);
+      mockFs.writeFile.mockResolvedValue(undefined);
+
+      const invocation = tool.build(params);
+      const result = await invocation.execute(mockAbortSignal);
+
+      expect(result.llmContent).toContain('success'); // NOTE(review): loose - the failure payload asserted in the write-error test ('"success":false') would match too
+      expect(result.returnDisplay).toEqual({
+        type: 'todo_list',
+        todos: [
+          { id: '1', content: 'Task 1', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'in_progress' },
+        ],
+      });
+      expect(mockFs.writeFile).toHaveBeenCalledWith(
+        expect.stringContaining('test-session-123.json'), // target path includes the session id from the config stub
+        expect.stringContaining('"todos"'),
+        'utf-8',
+      );
+    });
+
+    it('should replace todos with new ones', async () => {
+      const existingTodos = [
+        { id: '1', content: 'Existing Task', status: 'completed' },
+      ];
+
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: 'Updated Task', status: 'completed' },
+          { id: '2', content: 'New Task', status: 'pending' },
+        ],
+      };
+
+      // Mock existing file: readFile resolves with a stored list that differs from params
+      mockFs.readFile.mockResolvedValue(
+        JSON.stringify({ todos: existingTodos }),
+      );
+      mockFs.mkdir.mockResolvedValue(undefined);
+      mockFs.writeFile.mockResolvedValue(undefined);
+
+      const invocation = tool.build(params);
+      const result = await invocation.execute(mockAbortSignal);
+
+      expect(result.llmContent).toContain('success');
+      expect(result.returnDisplay).toEqual({ // display must equal the submitted list exactly; 'Existing Task' is gone
+        type: 'todo_list',
+        todos: [
+          { id: '1', content: 'Updated Task', status: 'completed' },
+          { id: '2', content: 'New Task', status: 'pending' },
+        ],
+      });
+      expect(mockFs.writeFile).toHaveBeenCalledWith(
+        expect.stringContaining('test-session-123.json'),
+        expect.stringMatching(/"Updated Task"/),
+        'utf-8',
+      );
+    });
+
+    it('should handle file write errors', async () => {
+      const params: TodoWriteParams = {
+        todos: [
+          { id: '1', content: 'Task 1', status: 'pending' },
+          { id: '2', content: 'Task 2', status: 'pending' },
+        ],
+      };
+
+      mockFs.readFile.mockRejectedValue({ code: 'ENOENT' });
+      mockFs.mkdir.mockResolvedValue(undefined);
+      mockFs.writeFile.mockRejectedValue(new Error('Write failed')); // simulate the write step failing
+
+      const invocation = tool.build(params);
+      const result = await invocation.execute(mockAbortSignal); // must resolve: the failure is reported in the result, not thrown
+
+      expect(result.llmContent).toContain('"success":false');
+      expect(result.returnDisplay).toContain('Error writing todos');
+    });
+
+    it('should handle empty todos array', async () => {
+      const params: TodoWriteParams = {
+        todos: [],
+      };
+
+      mockFs.mkdir.mockResolvedValue(undefined); // readFile is left unstubbed in this test
+      mockFs.writeFile.mockResolvedValue(undefined);
+
+      const invocation = tool.build(params);
+      const result = await invocation.execute(mockAbortSignal);
+
+      expect(result.llmContent).toContain('success');
+      expect(result.returnDisplay).toEqual({
+        type: 'todo_list',
+        todos: [],
+      });
+      expect(mockFs.writeFile).toHaveBeenCalledWith( // a file is still written for an empty list
+        expect.stringContaining('test-session-123.json'),
+        expect.stringContaining('"todos"'),
+        'utf-8',
+      );
+    });
+  });
+
+  describe('tool properties', () => { // pins the public name, display name, kind and schema shape
+    it('should have correct tool name', () => {
+      expect(TodoWriteTool.Name).toBe('todo_write');
+      expect(tool.name).toBe('todo_write');
+    });
+
+    it('should have correct display name', () => {
+      expect(tool.displayName).toBe('Todo Write');
+    });
+
+    it('should have correct kind', () => {
+      expect(tool.kind).toBe('think');
+    });
+
+    it('should have schema with required properties', () => {
+      const schema = tool.schema;
+      expect(schema.name).toBe('todo_write');
+      expect(schema.parametersJsonSchema).toHaveProperty('properties.todos');
+      expect(schema.parametersJsonSchema).not.toHaveProperty( // the schema must not expose a 'merge' parameter
+        'properties.merge',
+      );
+    });
+  });
+
+  describe('getDescription', () => { // expected text switches on the mocked fsSync.existsSync result
+    it('should return "Create todos" when no todos file exists', () => {
+      // Mock existsSync to return false (file doesn't exist)
+      mockFsSync.existsSync.mockReturnValue(false);
+
+      const params = {
+        todos: [{ id: '1', content: 'Test todo', status: 'pending' as const }],
+      };
+      const invocation = tool.build(params);
+      expect(invocation.getDescription()).toBe('Create todos');
+    });
+
+    it('should return "Update todos" when todos file exists', () => {
+      // Mock existsSync to return true (file exists)
+      mockFsSync.existsSync.mockReturnValue(true);
+
+      const params = {
+        todos: [
+          { id: '1', content: 'Updated todo', status: 'completed' as const },
+        ],
+      };
+      const invocation = tool.build(params);
+      expect(invocation.getDescription()).toBe('Update todos');
+    });
+  });
+});
--- a/packages/core/src/tools/todoWrite.ts
+++ b/packages/core/src/tools/todoWrite.ts
@@ -0,0 +1,466 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  BaseDeclarativeTool,
+  BaseToolInvocation,
+  Kind,
+  ToolResult,
+} from './tools.js';
+import { FunctionDeclaration } from '@google/genai';
+import * as fs from 'fs/promises';
+import * as fsSync from 'fs';
+import * as path from 'path';
+import * as process from 'process';
+
+import { QWEN_DIR } from '../utils/paths.js';
+import { SchemaValidator } from '../utils/schemaValidator.js';
+import { Config } from '../config/config.js';
+
+export interface TodoItem { // One entry of the todo list
+  id: string; // identifier of the item; required by the JSON schema
+  content: string; // task description; the JSON schema requires at least one character
+  status: 'pending' | 'in_progress' | 'completed'; // same three values as the JSON schema's status enum
+}
+
+export interface TodoWriteParams { // Arguments of the todo_write tool
+  todos: TodoItem[]; // the todo list to write (described in the schema as "The updated todo list")
+  modified_by_user?: boolean; // NOTE(review): not declared in the tool's JSON schema - presumably set by the host rather than the model; confirm
+  modified_content?: string; // NOTE(review): also absent from the JSON schema - presumably the user's edited content; confirm against callers
+}
+
+const todoWriteToolSchemaData: FunctionDeclaration = { // Function declaration for todo_write: name, short description and parameter JSON schema
+  name: 'todo_write',
+  description:
+    'Creates and manages a structured task list for your current coding session. This helps track progress, organize complex tasks, and demonstrate thoroughness.',
+  parametersJsonSchema: {
+    type: 'object',
+    properties: {
+      todos: {
+        type: 'array', // no minItems is set, so an empty array satisfies the schema
+        items: {
+          type: 'object',
+          properties: {
+            content: {
+              type: 'string',
+              minLength: 1, // empty content is rejected at schema level
+            },
+            status: {
+              type: 'string',
+              enum: ['pending', 'in_progress', 'completed'], // mirrors TodoItem['status']
+            },
+            id: {
+              type: 'string', // unlike content, no minLength is declared here
+            },
+          },
+          required: ['content', 'status', 'id'],
+          additionalProperties: false, // an item may carry only content, status and id
+        },
+        description: 'The updated todo list',
+      },
+    },
+    required: ['todos'], // only todos is declared; the optional TodoWriteParams fields are not part of the schema
+    $schema: 'http://json-schema.org/draft-07/schema#',
+  },
+};
+
+const todoWriteToolDescription = `
+Use this tool to create and manage a structured task list for your current coding session. This helps you track progress, organize complex tasks, and demonstrate thoroughness to the user.
+It also helps the user understand the progress of the task and overall progress of their requests.
+
+## When to Use This Tool
+Use this tool proactively in these scenarios:
+
+1. Complex multi-step tasks - When a task requires 3 or more distinct steps or actions
+2. Non-trivial and complex tasks - Tasks that require careful planning or multiple operations
+3. User explicitly requests todo list - When the user directly asks you to use the todo list
+4. User provides multiple tasks - When users provide a list of things to be done (numbered or comma-separated)
+5. After receiving new instructions - Immediately capture user requirements as todos
+6. When you start working on a task - Mark it as in_progress BEFORE beginning work. Ideally you should only have one todo as in_progress at a time
+7. After completing a task - Mark it as completed and add any new follow-up tasks discovered during implementation
+
+## When NOT to Use This Tool
+
+Skip using this tool when:
+1. There is only a single, straightforward task
+2. The task is trivial and tracking it provides no organizational benefit
+3. The task can be completed in less than 3 trivial steps
+4. The task is purely conversational or informational
+
+NOTE that you should not use this tool if there is only one trivial task to do. In this case you are better off just doing the task directly.
+
+## Examples of When to Use the Todo List
+
+<example>
+User: I want to add a dark mode toggle to the application settings. Make sure you run the tests and build when you're done!
+Assistant: I'll help add a dark mode toggle to your application settings. Let me create a todo list to track this implementation.
+*Creates todo list with the following items:*
+1. Create dark mode toggle component in Settings page
+2. Add dark mode state management (context/store)
+3. Implement CSS-in-JS styles for dark theme
+4. Update existing components to support theme switching
+5. Run tests and build process, addressing any failures or errors that occur
+*Begins working on the first task*
+
+<reasoning>
+The assistant used the todo list because:
+1. Adding dark mode is a multi-step feature requiring UI, state management, and styling changes
+2. The user explicitly requested tests and build be run afterward
+3. The assistant inferred that tests and build need to pass by adding "Run tests and build process, addressing any failures or errors that occur" as the final task
+</reasoning>
+</example>
+
+<example>
+User: Help me rename the function getCwd to getCurrentWorkingDirectory across my project
+Assistant: Let me first search through your codebase to find all occurrences of 'getCwd'.
+*Uses grep or search tools to locate all instances of getCwd in the codebase*
+Assistant: I've found 15 instances of 'getCwd' across 8 different files. Let me create a todo list to track these changes.
+*Creates todo list with specific items for each file that needs updating*
+
+<reasoning>
+The assistant used the todo list because:
+1. First, the assistant searched to understand the scope of the task
+2. Upon finding multiple occurrences across different files, it determined this was a complex task with multiple steps
+3. The todo list helps ensure every instance is tracked and updated systematically
+4. This approach prevents missing any occurrences and maintains code consistency
+</reasoning>
+</example>
+
+
+<example>
+User: I need to implement these features for my e-commerce site: user registration, product catalog, shopping cart, and checkout flow.
+Assistant: I'll help implement these features. First, let's add all the features to the todo list.
+*Creates a todo list breaking down each feature into specific tasks based on the project architecture*
+Assistant: Let's start with user registration. This will involve creating database models, API endpoints, and frontend forms that integrate with your existing authentication system.
+
+<reasoning>
+The assistant used the todo list because:
+1. The user provided multiple complex features to implement in a comma separated list
+2. The todo list helps organize these large features into manageable tasks
+3. This approach allows for tracking progress across the entire implementation
+</reasoning>
+</example>
+
+<example>
+User: Can you help optimize my React application? It's rendering slowly and has performance issues.
+Assistant: I'll help optimize your React application. First, let me examine your codebase to identify potential performance bottlenecks.
+*Reviews component structure, render patterns, state management, and data fetching*
+Assistant: After analyzing your codebase, I've identified several performance issues. Let me create a todo list to track our optimization efforts.
+*Creates todo list with items like: 1) Implement memoization for expensive calculations in ProductList, 2) Add virtualization for long lists in Dashboard, 3) Optimize image loading in Gallery component, 4) Fix state update loops in ShoppingCart, 5) Review bundle size and implement code splitting*
+Let's start by implementing memoization for the expensive calculations in your ProductList component.
+
+<reasoning>
+The assistant used the todo list because:
+1. First, the assistant examined the codebase to identify specific performance issues
+2. Based on this analysis, it identified multiple optimization opportunities
+3. Performance optimization is a non-trivial task requiring multiple steps
+4. The todo list helps methodically track improvements across different components
+5. This systematic approach ensures all performance bottlenecks are addressed
+</reasoning>
+</example>
+
+## Examples of When NOT to Use the Todo List
+
+<example>
+User: How do I print 'Hello World' in Python?
+Assistant: In Python, you can print "Hello World" with this simple code:
+
+python
+print("Hello World")
+
+This will output the text "Hello World" to the console when executed.
+
+<reasoning>
+The assistant did not use the todo list because this is a single, trivial task that can be completed in one step. There's no need to track multiple tasks or steps for such a straightforward request.
+</reasoning>
+</example>
+
+<example>
+User: What does the git status command do?
+Assistant: The git status command shows the current state of your working directory and staging area. It displays which changes have been staged, which haven't, and which files aren't being tracked by Git. This is a fundamental command for understanding the current state of your repository before making commits.
+
+<reasoning>
+The assistant did not use the todo list because this is an informational request with no actual coding task to complete. The user is simply asking for an explanation, not for the assistant to perform multiple steps or tasks.
+</reasoning>
+</example>
+
+<example>
+User: Can you add a comment to the calculateTotal function to explain what it does?
+Assistant: Sure, let me add a comment to the calculateTotal function to explain what it does.
+* Uses the Edit tool to add a comment to the calculateTotal function *
+
+<reasoning>
+The assistant did not use the todo list because this is a single, straightforward task confined to one location in the code. Adding a comment doesn't require tracking multiple steps or systematic organization.
+</reasoning>
+</example>
+
+<example>
+User: Run npm install for me and tell me what happens.
+Assistant: I'll run the npm install command for you.
+
+*Executes: npm install*
+
+The command completed successfully. Here's the output:
+[Output of npm install command]
+
+All dependencies have been installed according to your package.json file.
+
+<reasoning>
+The assistant did not use the todo list because this is a single command execution with immediate results. There are no multiple steps to track or organize, making the todo list unnecessary for this straightforward task.
+</reasoning>
+</example>
+
+## Task States and Management
+
+1. **Task States**: Use these states to track progress:
+   - pending: Task not yet started
+   - in_progress: Currently working on (limit to ONE task at a time)
+   - completed: Task finished successfully
+
+2. **Task Management**:
+   - Update task status in real-time as you work
+   - Mark tasks complete IMMEDIATELY after finishing (don't batch completions)
+   - Only have ONE task in_progress at any time
+   - Complete current tasks before starting new ones
+   - Remove tasks that are no longer relevant from the list entirely
+
+3. **Task Completion Requirements**:
+   - ONLY mark a task as completed when you have FULLY accomplished it
+   - If you encounter errors, blockers, or cannot finish, keep the task as in_progress
+   - When blocked, create a new task describing what needs to be resolved
+   - Never mark a task as completed if:
+     - Tests are failing
+     - Implementation is partial
+     - You encountered unresolved errors
+     - You couldn't find necessary files or dependencies
+
+4. **Task Breakdown**:
+   - Create specific, actionable items
+   - Break complex tasks into smaller, manageable steps
+   - Use clear, descriptive task names
+
+When in doubt, use this tool. Being proactive with task management demonstrates attentiveness and ensures you complete all requirements successfully.
+`;
+
+// Name of the sub-directory, joined under QWEN_DIR, that holds the per-session todo JSON files.
+const TODO_SUBDIR = 'todos';
+
+/**
+ * Builds the path of the JSON file that stores a session's todos.
+ *
+ * The base directory is the first non-empty value among the `HOME` and
+ * `USERPROFILE` environment variables, falling back to the current working
+ * directory. The file lives under `<base>/<QWEN_DIR>/<TODO_SUBDIR>/`.
+ *
+ * @param sessionId - Session identifier; `'default'` is used when it is
+ *   missing or empty.
+ * @returns Path to `<sessionId>.json` inside the todos directory.
+ */
+function getTodoFilePath(sessionId?: string): string {
+  const baseDir = process.env.HOME || process.env.USERPROFILE || process.cwd();
+  const sessionFile = `${sessionId || 'default'}.json`;
+  return path.join(baseDir, QWEN_DIR, TODO_SUBDIR, sessionFile);
+}
+
+/**
+ * Loads the persisted todo list for a session.
+ *
+ * @param sessionId - Session whose todo file should be read.
+ * @returns The `todos` array stored in the file, or an empty array when the
+ *   file does not exist or its `todos` property is not an array.
+ * @throws Any read error other than `ENOENT`, and any JSON parsing error.
+ */
+async function readTodosFromFile(sessionId?: string): Promise<TodoItem[]> {
+  let raw: string;
+  try {
+    raw = await fs.readFile(getTodoFilePath(sessionId), 'utf-8');
+  } catch (cause) {
+    // A missing file simply means no todos were saved for this session yet.
+    const isMissingFile =
+      cause instanceof Error &&
+      (cause as Error & { code?: string }).code === 'ENOENT';
+    if (isMissingFile) {
+      return [];
+    }
+    throw cause;
+  }
+
+  const parsed = JSON.parse(raw);
+  if (Array.isArray(parsed.todos)) {
+    return parsed.todos;
+  }
+  return [];
+}
+
+/**
+ * Persists a todo list to the session's JSON file, creating the containing
+ * directory first if necessary.
+ *
+ * The file holds an object with the `todos` array and the `sessionId`
+ * (`'default'` when none is given), pretty-printed with two-space indent.
+ *
+ * @param todos - The complete list to store; it replaces any previous content.
+ * @param sessionId - Session the list belongs to.
+ */
+async function writeTodosToFile(
+  todos: TodoItem[],
+  sessionId?: string,
+): Promise<void> {
+  const targetPath = getTodoFilePath(sessionId);
+  await fs.mkdir(path.dirname(targetPath), { recursive: true });
+
+  const serialized = JSON.stringify(
+    { todos, sessionId: sessionId || 'default' },
+    null,
+    2,
+  );
+  await fs.writeFile(targetPath, serialized, 'utf-8');
+}
+
+/**
+ * A single invocation of the todo_write tool: stores the supplied todo list
+ * for the current session and reports the outcome.
+ */
+class TodoWriteToolInvocation extends BaseToolInvocation<
+  TodoWriteParams,
+  ToolResult
+> {
+  /**
+   * @param config - Supplies the session id used to pick the todo file.
+   * @param params - Tool parameters for this invocation.
+   * @param operationType - Whether this call creates the list or updates an
+   *   existing one; only affects the description text.
+   */
+  constructor(
+    private readonly config: Config,
+    params: TodoWriteParams,
+    private readonly operationType: 'create' | 'update' = 'update',
+  ) {
+    super(params);
+  }
+
+  /** Short label describing the operation. */
+  getDescription(): string {
+    if (this.operationType === 'create') {
+      return 'Create todos';
+    }
+    return 'Update todos';
+  }
+
+  /** Todo writes never ask the user for confirmation. */
+  override async shouldConfirmExecute(
+    _abortSignal: AbortSignal,
+  ): Promise<false> {
+    return false;
+  }
+
+  /**
+   * Picks the list to persist: the user-edited JSON when present, otherwise
+   * the todos passed in the parameters.
+   */
+  private resolveTodos(): TodoItem[] {
+    const { todos, modified_by_user, modified_content } = this.params;
+    if (!modified_by_user || modified_content === undefined) {
+      // Normal path: the new list simply replaces the old one.
+      return todos;
+    }
+    // The user edited the content in an external editor; parse it directly.
+    const edited = JSON.parse(modified_content);
+    return Array.isArray(edited.todos) ? edited.todos : [];
+  }
+
+  /**
+   * Writes the todos to the session file.
+   *
+   * @returns On success, a JSON `llmContent` with `success: true` and the
+   *   stored todos plus a structured `todo_list` display; on failure, a JSON
+   *   `llmContent` with `success: false` and an error string for display.
+   */
+  async execute(_signal: AbortSignal): Promise<ToolResult> {
+    const sessionId = this.config.getSessionId();
+
+    try {
+      const finalTodos = this.resolveTodos();
+      await writeTodosToFile(finalTodos, sessionId);
+
+      return {
+        llmContent: JSON.stringify({ success: true, todos: finalTodos }),
+        // Structured object so the UI can render the list richly.
+        returnDisplay: { type: 'todo_list' as const, todos: finalTodos },
+      };
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      console.error(
+        `[TodoWriteTool] Error executing todo_write: ${errorMessage}`,
+      );
+      return {
+        llmContent: JSON.stringify({
+          success: false,
+          error: `Failed to write todos. Detail: ${errorMessage}`,
+        }),
+        returnDisplay: `Error writing todos: ${errorMessage}`,
+      };
+    }
+  }
+}
+
+/**
+ * Public entry point for loading a session's saved todos (useful for session
+ * recovery).
+ *
+ * @param sessionId - Session to look up.
+ * @returns The stored todos, or an empty array when no todo file exists.
+ */
+export async function readTodosForSession(
+  sessionId?: string,
+): Promise<TodoItem[]> {
+  const storedTodos = await readTodosFromFile(sessionId);
+  return storedTodos;
+}
+
+/**
+ * Lists the session ids that have a todo file in the todos directory.
+ *
+ * Every `*.json` entry in the directory is reported with its trailing
+ * `.json` extension removed. Only the suffix is stripped, so a name that
+ * contains `.json` elsewhere (e.g. `a.json.bak.json`) keeps the rest intact.
+ *
+ * @returns Session ids, or an empty array when the directory does not exist.
+ * @throws Any directory read error other than `ENOENT`.
+ */
+export async function listTodoSessions(): Promise<string[]> {
+  const extension = '.json';
+  try {
+    const homeDir =
+      process.env.HOME || process.env.USERPROFILE || process.cwd();
+    const todoDir = path.join(homeDir, QWEN_DIR, TODO_SUBDIR);
+    const files = await fs.readdir(todoDir);
+    return files
+      .filter((file: string) => file.endsWith(extension))
+      // Strip by length from the end; String.replace with a string pattern
+      // would remove the first occurrence rather than the suffix.
+      .map((file: string) => file.slice(0, -extension.length));
+  } catch (err) {
+    const error = err as Error & { code?: string };
+    if (!(error instanceof Error) || error.code !== 'ENOENT') {
+      throw err;
+    }
+    // No todos directory yet means no sessions have stored todos.
+    return [];
+  }
+}
+
+/**
+ * Declarative tool definition for todo_write: validates the parameters and
+ * creates invocations that persist the session's todo list.
+ */
+export class TodoWriteTool extends BaseDeclarativeTool<
+  TodoWriteParams,
+  ToolResult
+> {
+  static readonly Name: string = todoWriteToolSchemaData.name!;
+
+  constructor(private readonly config: Config) {
+    super(
+      TodoWriteTool.Name,
+      'Todo Write',
+      todoWriteToolDescription,
+      Kind.Think,
+      todoWriteToolSchemaData.parametersJsonSchema as Record<string, unknown>,
+    );
+  }
+
+  /**
+   * Checks the parameters against the JSON schema and then applies the
+   * todo-specific rules: `todos` must be an array, every item needs a
+   * non-blank `id` and `content` plus a known `status`, and ids must be unique.
+   *
+   * @returns A message describing the first problem found, or `null` when the
+   *   parameters are valid.
+   */
+  override validateToolParams(params: TodoWriteParams): string | null {
+    const schemaErrors = SchemaValidator.validate(
+      this.schema.parametersJsonSchema,
+      params,
+    );
+    if (schemaErrors) {
+      return schemaErrors;
+    }
+
+    const { todos } = params;
+    if (!Array.isArray(todos)) {
+      return 'Parameter "todos" must be an array.';
+    }
+
+    const allowedStatuses = ['pending', 'in_progress', 'completed'];
+    const isBlank = (value: unknown): boolean =>
+      typeof value !== 'string' || value.trim() === '';
+
+    // Per-item checks run for the whole list before the uniqueness check.
+    for (const item of todos) {
+      if (isBlank(item.id)) {
+        return 'Each todo must have a non-empty "id" string.';
+      }
+      if (isBlank(item.content)) {
+        return 'Each todo must have a non-empty "content" string.';
+      }
+      if (!allowedStatuses.includes(item.status)) {
+        return 'Each todo must have a valid "status" (pending, in_progress, completed).';
+      }
+    }
+
+    // A Set collapses repeated ids, so a smaller size reveals duplicates.
+    const distinctIdCount = new Set(todos.map((item) => item.id)).size;
+    if (distinctIdCount !== todos.length) {
+      return 'Todo IDs must be unique within the array.';
+    }
+
+    return null;
+  }
+
+  /**
+   * Creates the invocation, labelling it "update" when the session's todo
+   * file already exists on disk and "create" otherwise.
+   */
+  protected createInvocation(params: TodoWriteParams) {
+    const todoFilePath = getTodoFilePath(this.config.getSessionId());
+    const fileAlreadyExists = fsSync.existsSync(todoFilePath);
+
+    return new TodoWriteToolInvocation(
+      this.config,
+      params,
+      fileAlreadyExists ? 'update' : 'create',
+    );
+  }
+}
--- a/packages/core/src/tools/tools.ts
+++ b/packages/core/src/tools/tools.ts
@@ -491,7 +491,7 @@ export function hasCycleInSchema(schema: object): boolean {
  return traverse(schema, new Set<string>(), new Set<string>());
 }

-export type ToolResultDisplay = string | FileDiff;
+export type ToolResultDisplay = string | FileDiff | TodoResultDisplay;

 export interface FileDiff {
  fileDiff: string;
@@ -508,6 +508,15 @@ export interface DiffStat {
  user_removed_lines: number;
 }

+/**
+ * Structured tool-result display describing a todo list. It is one member of
+ * the `ToolResultDisplay` union and is identified by `type: 'todo_list'`.
+ */
+export interface TodoResultDisplay {
+  /** Literal tag identifying this display variant. */
+  type: 'todo_list';
+  /** The todo items, each with an id, its text content, and a status. */
+  todos: Array<{
+    id: string;
+    content: string;
+    status: 'pending' | 'in_progress' | 'completed';
+  }>;
+}
+
 export interface ToolEditConfirmationDetails {
  type: 'edit';
  title: string;