Mirror of https://github.com/QwenLM/qwen-code.git, synced 2025-12-25 11:09:13 +00:00.

Compare commits: fix/openai...fix/max-to (4 commits)
Commits:

- af03eaa57f
- 3579d6555a
- 9a56560eb4
- da0863b943
**CHANGELOG.md** (20 lines changed)
```diff
@@ -1,5 +1,25 @@
 # Changelog
 
+## 0.0.12
+
+- Added vision model support for Qwen-OAuth authentication.
+- Synced upstream `gemini-cli` to v0.3.4 with numerous improvements and bug fixes.
+- Enhanced subagent functionality with system reminders and improved user experience.
+- Added tool call type coercion for better compatibility.
+- Fixed arrow key navigation issues on Windows.
+- Fixed missing tool call chunks for OpenAI logging.
+- Fixed system prompt issues to avoid malformed tool calls.
+- Fixed terminal flicker when subagent is executing.
+- Fixed duplicate subagents configuration when running in home directory.
+- Fixed Esc key unable to cancel subagent dialog.
+- Added confirmation prompt for `/init` command when context file exists.
+- Added `skipLoopDetection` configuration option.
+- Fixed `is_background` parameter reset issues.
+- Enhanced Windows compatibility with multi-line paste handling.
+- Improved subagent documentation and branding consistency.
+- Fixed various linting errors and improved code quality.
+- Miscellaneous improvements and bug fixes.
+
 ## 0.0.11
 
 - Added subagents feature with file-based configuration system for specialized AI assistants.
```
**package-lock.json** (generated, 12 lines changed)
```diff
@@ -1,12 +1,12 @@
 {
   "name": "@qwen-code/qwen-code",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@qwen-code/qwen-code",
-      "version": "0.0.11",
+      "version": "0.0.12",
       "workspaces": [
         "packages/*"
       ],
@@ -13454,7 +13454,7 @@
     },
     "packages/cli": {
       "name": "@qwen-code/qwen-code",
-      "version": "0.0.11",
+      "version": "0.0.12",
       "dependencies": {
         "@google/genai": "1.9.0",
         "@iarna/toml": "^2.2.5",
@@ -13662,7 +13662,7 @@
     },
     "packages/core": {
       "name": "@qwen-code/qwen-code-core",
-      "version": "0.0.11",
+      "version": "0.0.12",
       "dependencies": {
         "@google/genai": "1.13.0",
         "@lvce-editor/ripgrep": "^1.6.0",
@@ -13788,7 +13788,7 @@
     },
     "packages/test-utils": {
       "name": "@qwen-code/qwen-code-test-utils",
-      "version": "0.0.11",
+      "version": "0.0.12",
       "dev": true,
       "license": "Apache-2.0",
       "devDependencies": {
@@ -13800,7 +13800,7 @@
     },
     "packages/vscode-ide-companion": {
       "name": "qwen-code-vscode-ide-companion",
-      "version": "0.0.11",
+      "version": "0.0.12",
       "license": "LICENSE",
       "dependencies": {
         "@modelcontextprotocol/sdk": "^1.15.1",
```
**package.json** (root)

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@qwen-code/qwen-code",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "engines": {
     "node": ">=20.0.0"
   },
@@ -13,7 +13,7 @@
     "url": "git+https://github.com/QwenLM/qwen-code.git"
   },
   "config": {
-    "sandboxImageUri": "ghcr.io/qwenlm/qwen-code:0.0.11"
+    "sandboxImageUri": "ghcr.io/qwenlm/qwen-code:0.0.12"
   },
   "scripts": {
     "start": "node scripts/start.js",
```
**packages/cli/package.json**

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@qwen-code/qwen-code",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "description": "Qwen Code",
   "repository": {
     "type": "git",
@@ -25,7 +25,7 @@
     "dist"
   ],
   "config": {
-    "sandboxImageUri": "ghcr.io/qwenlm/qwen-code:0.0.11"
+    "sandboxImageUri": "ghcr.io/qwenlm/qwen-code:0.0.12"
   },
   "dependencies": {
     "@google/genai": "1.9.0",
```
**KeypressContext tests**

```diff
@@ -526,7 +526,7 @@ describe('KeypressContext - Kitty Protocol', () => {
       });
 
       await waitFor(() => {
-        expect(keyHandler).toHaveBeenCalledTimes(2); // 1 paste event + 1 paste event for 'after'
+        expect(keyHandler).toHaveBeenCalledTimes(6); // 1 paste event + 5 individual chars for 'after'
       });
 
       // Should emit paste event first
@@ -538,12 +538,40 @@ describe('KeypressContext - Kitty Protocol', () => {
         }),
       );
 
-      // Then process 'after' as a paste event (since it's > 2 chars)
+      // Then process 'after' as individual characters (since it doesn't contain return)
       expect(keyHandler).toHaveBeenNthCalledWith(
         2,
         expect.objectContaining({
-          paste: true,
-          sequence: 'after',
+          name: 'a',
+          paste: false,
         }),
       );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        3,
+        expect.objectContaining({
+          name: 'f',
+          paste: false,
+        }),
+      );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        4,
+        expect.objectContaining({
+          name: 't',
+          paste: false,
+        }),
+      );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        5,
+        expect.objectContaining({
+          name: 'e',
+          paste: false,
+        }),
+      );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        6,
+        expect.objectContaining({
+          name: 'r',
+          paste: false,
+        }),
+      );
     });
@@ -571,7 +599,7 @@ describe('KeypressContext - Kitty Protocol', () => {
       });
 
       await waitFor(() => {
-        expect(keyHandler).toHaveBeenCalledTimes(14); // Adjusted based on actual behavior
+        expect(keyHandler).toHaveBeenCalledTimes(16); // 5 + 1 + 6 + 1 + 3 = 16 calls
       });
 
       // Check the sequence: 'start' (5 chars) + paste1 + 'middle' (6 chars) + paste2 + 'end' (3 chars as paste)
@@ -643,13 +671,18 @@ describe('KeypressContext - Kitty Protocol', () => {
         }),
       );
 
-      // 'end' as paste event (since it's > 2 chars)
+      // 'end' as individual characters (since it doesn't contain return)
       expect(keyHandler).toHaveBeenNthCalledWith(
         callIndex++,
-        expect.objectContaining({
-          paste: true,
-          sequence: 'end',
-        }),
+        expect.objectContaining({ name: 'e' }),
+      );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        callIndex++,
+        expect.objectContaining({ name: 'n' }),
+      );
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        callIndex++,
+        expect.objectContaining({ name: 'd' }),
       );
     });
@@ -738,16 +771,18 @@ describe('KeypressContext - Kitty Protocol', () => {
       });
 
       await waitFor(() => {
-        // With the current implementation, fragmented data gets processed differently
-        // The first fragment '\x1b[20' gets processed as individual characters
-        // The second fragment '0~content\x1b[2' gets processed as paste + individual chars
-        // The third fragment '01~' gets processed as individual characters
-        expect(keyHandler).toHaveBeenCalled();
+        // With the current implementation, fragmented paste markers get reconstructed
+        // into a single paste event for 'content'
+        expect(keyHandler).toHaveBeenCalledTimes(1);
       });
 
-      // The current implementation processes fragmented paste markers as separate events
-      // rather than reconstructing them into a single paste event
-      expect(keyHandler.mock.calls.length).toBeGreaterThan(1);
+      // Should reconstruct the fragmented paste markers into a single paste event
+      expect(keyHandler).toHaveBeenCalledWith(
+        expect.objectContaining({
+          paste: true,
+          sequence: 'content',
+        }),
+      );
     });
   });
@@ -851,19 +886,38 @@ describe('KeypressContext - Kitty Protocol', () => {
         stdin.emit('data', Buffer.from('lo'));
       });
 
-      // With the current implementation, data is processed as it arrives
-      // First chunk 'hel' is treated as paste (multi-character)
+      // With the current implementation, data is processed as individual characters
+      // since 'hel' doesn't contain return (0x0d)
       expect(keyHandler).toHaveBeenNthCalledWith(
         1,
         expect.objectContaining({
-          paste: true,
-          sequence: 'hel',
+          name: 'h',
+          sequence: 'h',
+          paste: false,
         }),
       );
 
-      // Second chunk 'lo' is processed as individual characters
       expect(keyHandler).toHaveBeenNthCalledWith(
         2,
         expect.objectContaining({
+          name: 'e',
+          sequence: 'e',
+          paste: false,
+        }),
+      );
+
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        3,
+        expect.objectContaining({
+          name: 'l',
+          sequence: 'l',
+          paste: false,
+        }),
+      );
+
+      // Second chunk 'lo' is also processed as individual characters
+      expect(keyHandler).toHaveBeenNthCalledWith(
+        4,
+        expect.objectContaining({
           name: 'l',
           sequence: 'l',
@@ -872,7 +926,7 @@ describe('KeypressContext - Kitty Protocol', () => {
         }),
       );
 
       expect(keyHandler).toHaveBeenNthCalledWith(
-        3,
+        5,
         expect.objectContaining({
           name: 'o',
           sequence: 'o',
@@ -880,7 +934,7 @@ describe('KeypressContext - Kitty Protocol', () => {
         }),
       );
 
-      expect(keyHandler).toHaveBeenCalledTimes(3);
+      expect(keyHandler).toHaveBeenCalledTimes(5);
     } finally {
       vi.useRealTimers();
     }
@@ -907,14 +961,20 @@ describe('KeypressContext - Kitty Protocol', () => {
       });
 
       // Should flush immediately without waiting for timeout
-      // Large data gets treated as paste event
-      expect(keyHandler).toHaveBeenCalledTimes(1);
-      expect(keyHandler).toHaveBeenCalledWith(
-        expect.objectContaining({
-          paste: true,
-          sequence: largeData,
-        }),
-      );
+      // Large data without return gets treated as individual characters
+      expect(keyHandler).toHaveBeenCalledTimes(65);
+
+      // Each character should be processed individually
+      for (let i = 0; i < 65; i++) {
+        expect(keyHandler).toHaveBeenNthCalledWith(
+          i + 1,
+          expect.objectContaining({
+            name: 'x',
+            sequence: 'x',
+            paste: false,
+          }),
+        );
+      }
 
       // Advancing timer should not cause additional calls
       const callCountBefore = keyHandler.mock.calls.length;
```
**KeypressProvider source**

```diff
@@ -407,7 +407,11 @@ export function KeypressProvider({
         return;
       }
 
-      if (rawDataBuffer.length <= 2 || isPaste) {
+      if (
+        (rawDataBuffer.length <= 2 && rawDataBuffer.includes(0x0d)) ||
+        !rawDataBuffer.includes(0x0d) ||
+        isPaste
+      ) {
         keypressStream.write(rawDataBuffer);
       } else {
         // Flush raw data buffer as a paste event
```
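Pulled out of context, the new buffering rule is easier to read as a standalone predicate. The sketch below is a hypothetical helper, not code from the patch: it mirrors the condition above, where `0x0d` is the carriage return that multi-line pastes contain. Chunks that pass go through the regular keypress stream; chunks that fail are flushed as a single synthetic paste event.

```ts
// Hypothetical helper mirroring the patched condition (not from the patch).
// Returns true when the buffered chunk should be replayed through the normal
// keypress stream, false when it should be flushed as one paste event.
function shouldWriteToKeypressStream(
  rawDataBuffer: Buffer,
  isPaste: boolean,
): boolean {
  const hasReturn = rawDataBuffer.includes(0x0d); // 0x0d = carriage return
  return (rawDataBuffer.length <= 2 && hasReturn) || !hasReturn || isPaste;
}

// 'hello' has no carriage return, so it is replayed as individual keypresses:
shouldWriteToKeypressStream(Buffer.from('hello'), false); // true
// A long chunk containing \r (a multi-line paste) is flushed as a paste event:
shouldWriteToKeypressStream(Buffer.from('line1\rline2'), false); // false
```

This is what the updated tests above encode: multi-character input without a carriage return now arrives as individual key events rather than a paste.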
**packages/core/package.json**

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@qwen-code/qwen-code-core",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "description": "Qwen Code Core",
   "repository": {
     "type": "git",
```
**DashScopeOpenAICompatibleProvider tests**

```diff
@@ -560,4 +560,146 @@ describe('DashScopeOpenAICompatibleProvider', () => {
       ]);
     });
   });
+
+  describe('output token limits', () => {
+    it('should limit max_tokens when it exceeds model limit for qwen3-coder-plus', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Exceeds the 65536 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(65536); // Should be limited to model's output limit
+    });
+
+    it('should limit max_tokens when it exceeds model limit for qwen-vl-max-latest', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen-vl-max-latest',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 20000, // Exceeds the 8192 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(8192); // Should be limited to model's output limit
+    });
+
+    it('should not modify max_tokens when it is within model limit', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 1000, // Within the 65536 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(1000); // Should remain unchanged
+    });
+
+    it('should not add max_tokens when not present in request', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        // No max_tokens parameter
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBeUndefined(); // Should remain undefined
+    });
+
+    it('should handle null max_tokens parameter', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: null,
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBeNull(); // Should remain null
+    });
+
+    it('should use default output limit for unknown models', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'unknown-model',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 10000, // Exceeds the default 4096 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(4096); // Should be limited to default output limit
+    });
+
+    it('should preserve other request parameters when limiting max_tokens', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Will be limited
+        temperature: 0.8,
+        top_p: 0.9,
+        frequency_penalty: 0.1,
+        presence_penalty: 0.2,
+        stop: ['END'],
+        user: 'test-user',
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      // max_tokens should be limited
+      expect(result.max_tokens).toBe(65536);
+
+      // Other parameters should be preserved
+      expect(result.temperature).toBe(0.8);
+      expect(result.top_p).toBe(0.9);
+      expect(result.frequency_penalty).toBe(0.1);
+      expect(result.presence_penalty).toBe(0.2);
+      expect(result.stop).toEqual(['END']);
+      expect(result.user).toBe('test-user');
+    });
+
+    it('should work with vision models and output token limits', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen-vl-max-latest',
+        messages: [
+          {
+            role: 'user',
+            content: [
+              { type: 'text', text: 'Look at this image:' },
+              {
+                type: 'image_url',
+                image_url: { url: 'https://example.com/image.jpg' },
+              },
+            ],
+          },
+        ],
+        max_tokens: 20000, // Exceeds the 8192 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(8192); // Should be limited
+      expect(
+        (result as { vl_high_resolution_images?: boolean })
+          .vl_high_resolution_images,
+      ).toBe(true); // Vision-specific parameter should be preserved
+    });
+
+    it('should handle streaming requests with output token limits', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Exceeds the 65536 limit
+        stream: true,
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(65536); // Should be limited
+      expect(result.stream).toBe(true); // Streaming should be preserved
+    });
+  });
 });
```
**DashScopeOpenAICompatibleProvider source**

```diff
@@ -3,6 +3,7 @@ import type { Config } from '../../../config/config.js';
 import type { ContentGeneratorConfig } from '../../contentGenerator.js';
 import { AuthType } from '../../contentGenerator.js';
 import { DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES } from '../constants.js';
+import { tokenLimit } from '../../tokenLimits.js';
 import type {
   OpenAICompatibleProvider,
   DashScopeRequestMetadata,
@@ -65,6 +66,19 @@ export class DashScopeOpenAICompatibleProvider
     });
   }
 
+  /**
+   * Build and configure the request for DashScope API.
+   *
+   * This method applies DashScope-specific configurations including:
+   * - Cache control for system and user messages
+   * - Output token limits based on model capabilities
+   * - Vision model specific parameters (vl_high_resolution_images)
+   * - Request metadata for session tracking
+   *
+   * @param request - The original chat completion request parameters
+   * @param userPromptId - Unique identifier for the user prompt for session tracking
+   * @returns Configured request with DashScope-specific parameters applied
+   */
   buildRequest(
     request: OpenAI.Chat.ChatCompletionCreateParams,
     userPromptId: string,
@@ -79,21 +93,28 @@ export class DashScopeOpenAICompatibleProvider
       messages = this.addDashScopeCacheControl(messages, cacheTarget);
     }
 
+    // Apply output token limits based on model capabilities
+    // This ensures max_tokens doesn't exceed the model's maximum output limit
+    const requestWithTokenLimits = this.applyOutputTokenLimit(
+      request,
+      request.model,
+    );
+
     if (request.model.startsWith('qwen-vl')) {
       return {
-        ...request,
+        ...requestWithTokenLimits,
         messages,
         ...(this.buildMetadata(userPromptId) || {}),
         /* @ts-expect-error dashscope exclusive */
         vl_high_resolution_images: true,
-      };
+      } as OpenAI.Chat.ChatCompletionCreateParams;
     }
 
     return {
-      ...request, // Preserve all original parameters including sampling params
+      ...requestWithTokenLimits, // Preserve all original parameters including sampling params and adjusted max_tokens
       messages,
       ...(this.buildMetadata(userPromptId) || {}),
-    };
+    } as OpenAI.Chat.ChatCompletionCreateParams;
   }
 
   buildMetadata(userPromptId: string): DashScopeRequestMetadata {
@@ -246,6 +267,41 @@ export class DashScopeOpenAICompatibleProvider
     return contentArray;
   }
 
+  /**
+   * Apply output token limit to a request's max_tokens parameter.
+   *
+   * Ensures that existing max_tokens parameters don't exceed the model's maximum output
+   * token limit. Only modifies max_tokens when already present in the request.
+   *
+   * @param request - The chat completion request parameters
+   * @param model - The model name to get the output token limit for
+   * @returns The request with max_tokens adjusted to respect the model's limits (if present)
+   */
+  private applyOutputTokenLimit<T extends { max_tokens?: number | null }>(
+    request: T,
+    model: string,
+  ): T {
+    const currentMaxTokens = request.max_tokens;
+
+    // Only process if max_tokens is already present in the request
+    if (currentMaxTokens === undefined || currentMaxTokens === null) {
+      return request; // No max_tokens parameter, return unchanged
+    }
+
+    const modelLimit = tokenLimit(model, 'output');
+
+    // If max_tokens exceeds the model limit, cap it to the model's limit
+    if (currentMaxTokens > modelLimit) {
+      return {
+        ...request,
+        max_tokens: modelLimit,
+      };
+    }
+
+    // If max_tokens is within the limit, return the request unchanged
+    return request;
+  }
+
   /**
    * Check if cache control should be disabled based on configuration.
    *
```
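The clamping itself is simple enough to restate in a few lines. Here is a minimal sketch of the same behaviour as a free function, assuming the `tokenLimit(model, 'output')` lookup added in tokenLimits.ts below; the helper name is illustrative, not the shipped method:

```ts
// Illustrative restatement of applyOutputTokenLimit (assumes tokenLimit from
// tokenLimits.ts). max_tokens is only touched when it is present and above the
// model's output ceiling; undefined and null pass through untouched.
import { tokenLimit } from './tokenLimits.js';

function capMaxTokens<T extends { max_tokens?: number | null }>(
  request: T,
  model: string,
): T {
  const current = request.max_tokens;
  if (current === undefined || current === null) return request;
  const limit = tokenLimit(model, 'output');
  return current > limit ? { ...request, max_tokens: limit } : request;
}

// qwen3-coder-plus allows 65536 output tokens, so 100000 is capped:
capMaxTokens({ max_tokens: 100_000 }, 'qwen3-coder-plus').max_tokens; // 65536
```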
**tokenLimits tests**

```diff
@@ -1,5 +1,10 @@
 import { describe, it, expect } from 'vitest';
-import { normalize, tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js';
+import {
+  normalize,
+  tokenLimit,
+  DEFAULT_TOKEN_LIMIT,
+  DEFAULT_OUTPUT_TOKEN_LIMIT,
+} from './tokenLimits.js';
 
 describe('normalize', () => {
   it('should lowercase and trim the model string', () => {
@@ -225,3 +230,96 @@ describe('tokenLimit', () => {
     expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000);
   });
 });
+
+describe('tokenLimit with output type', () => {
+  describe('Qwen models with output limits', () => {
+    it('should return the correct output limit for qwen3-coder-plus', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536);
+      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
+    });
+
+    it('should return the correct output limit for qwen-vl-max-latest', () => {
+      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192);
+    });
+  });
+
+  describe('Default output limits', () => {
+    it('should return the default output limit for unknown models', () => {
+      expect(tokenLimit('unknown-model', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('gpt-4', 'output')).toBe(DEFAULT_OUTPUT_TOKEN_LIMIT);
+      expect(tokenLimit('claude-3.5-sonnet', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+    });
+
+    it('should return the default output limit for models without specific output patterns', () => {
+      expect(tokenLimit('qwen3-coder-7b', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('qwen-plus', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('qwen-vl-max', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+    });
+  });
+
+  describe('Input vs Output limits comparison', () => {
+    it('should return different limits for input vs output for qwen3-coder-plus', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); // 1M input
+      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); // 64K output
+    });
+
+    it('should return different limits for input vs output for qwen-vl-max-latest', () => {
+      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); // 128K input
+      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); // 8K output
+    });
+
+    it('should return same default limits for unknown models', () => {
+      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); // 128K input
+      expect(tokenLimit('unknown-model', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      ); // 4K output
+    });
+  });
+
+  describe('Backward compatibility', () => {
+    it('should default to input type when no type is specified', () => {
+      expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); // Should be input limit
+      expect(tokenLimit('qwen-vl-max-latest')).toBe(131072); // Should be input limit
+      expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); // Should be input default
+    });
+
+    it('should work with explicit input type', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576);
+      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072);
+      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT);
+    });
+  });
+
+  describe('Model normalization with output limits', () => {
+    it('should handle normalized model names for output limits', () => {
+      expect(tokenLimit('QWEN3-CODER-PLUS', 'output')).toBe(65536);
+      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
+      expect(tokenLimit('QWEN-VL-MAX-LATEST', 'output')).toBe(8192);
+    });
+
+    it('should handle complex model strings for output limits', () => {
+      expect(
+        tokenLimit(
+          ' a/b/c|QWEN3-CODER-PLUS:qwen3-coder-plus-2024-05-13 ',
+          'output',
+        ),
+      ).toBe(65536);
+      expect(
+        tokenLimit(
+          'provider/qwen-vl-max-latest:qwen-vl-max-latest-v1',
+          'output',
+        ),
+      ).toBe(8192);
+    });
+  });
+});
```
**tokenLimits.ts**

```diff
@@ -1,7 +1,15 @@
 type Model = string;
 type TokenCount = number;
 
+/**
+ * Token limit types for different use cases.
+ * - 'input': Maximum input context window size
+ * - 'output': Maximum output tokens that can be generated in a single response
+ */
+export type TokenLimitType = 'input' | 'output';
+
 export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)
+export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 4_096; // 4K tokens
 
 /**
  * Accurate numeric limits:
@@ -18,6 +26,10 @@ const LIMITS = {
   '1m': 1_048_576,
   '2m': 2_097_152,
   '10m': 10_485_760, // 10 million tokens
+  // Output token limits (typically much smaller than input limits)
+  '4k': 4_096,
+  '8k': 8_192,
+  '16k': 16_384,
 } as const;
 
 /** Robust normalizer: strips provider prefixes, pipes/colons, date/version suffixes, etc. */
@@ -36,7 +48,7 @@ export function normalize(model: string): string {
   // - dates (e.g., -20250219), -v1, version numbers, 'latest', 'preview' etc.
   s = s.replace(/-preview/g, '');
   // Special handling for Qwen model names that include "-latest" as part of the model name
-  if (!s.match(/^qwen-(?:plus|flash)-latest$/)) {
+  if (!s.match(/^qwen-(?:plus|flash|vl-max)-latest$/)) {
     // \d{6,} - Match 6 or more digits (dates) like -20250219 (6+ digit dates)
     // \d+x\d+b - Match patterns like 4x8b, -7b, -70b
     // v\d+(?:\.\d+)* - Match version patterns starting with 'v' like -v1, -v1.2, -v2.1.3
@@ -142,16 +154,48 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [
   [/^mistral-large-2.*$/, LIMITS['128k']],
 ];
 
-/** Return the token limit for a model string (uses normalize + ordered regex list). */
-export function tokenLimit(model: Model): TokenCount {
+/**
+ * Output token limit patterns for specific model families.
+ * These patterns define the maximum number of tokens that can be generated
+ * in a single response for specific models.
+ */
+const OUTPUT_PATTERNS: Array<[RegExp, TokenCount]> = [
+  // -------------------
+  // Alibaba / Qwen - DashScope Models
+  // -------------------
+  // Qwen3-Coder-Plus: 65,536 max output tokens
+  [/^qwen3-coder-plus(-.*)?$/, LIMITS['64k']],
+
+  // Qwen-VL-Max-Latest: 8,192 max output tokens
+  [/^qwen-vl-max-latest$/, LIMITS['8k']],
+];
+
+/**
+ * Return the token limit for a model string based on the specified type.
+ *
+ * This function determines the maximum number of tokens for either input context
+ * or output generation based on the model and token type. It uses the same
+ * normalization logic for consistency across both input and output limits.
+ *
+ * @param model - The model name to get the token limit for
+ * @param type - The type of token limit ('input' for context window, 'output' for generation)
+ * @returns The maximum number of tokens allowed for this model and type
+ */
+export function tokenLimit(
+  model: Model,
+  type: TokenLimitType = 'input',
+): TokenCount {
   const norm = normalize(model);
 
-  for (const [regex, limit] of PATTERNS) {
+  // Choose the appropriate patterns based on token type
+  const patterns = type === 'output' ? OUTPUT_PATTERNS : PATTERNS;
+
+  for (const [regex, limit] of patterns) {
     if (regex.test(norm)) {
       return limit;
     }
   }
 
-  // final fallback: DEFAULT_TOKEN_LIMIT (power-of-two 128K)
-  return DEFAULT_TOKEN_LIMIT;
+  // Return appropriate default based on token type
+  return type === 'output' ? DEFAULT_OUTPUT_TOKEN_LIMIT : DEFAULT_TOKEN_LIMIT;
 }
```
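Taken together, the extended API reads like this; the values are the ones exercised by the tests above:

```ts
import {
  tokenLimit,
  DEFAULT_TOKEN_LIMIT,
  DEFAULT_OUTPUT_TOKEN_LIMIT,
} from './tokenLimits.js';

// Input (context window) limits; 'input' is the default type.
tokenLimit('qwen3-coder-plus'); // 1_048_576 (1M context)
tokenLimit('qwen-vl-max-latest', 'input'); // 131_072 (128K context)

// Output (generation) limits consult the separate OUTPUT_PATTERNS table.
tokenLimit('qwen3-coder-plus', 'output'); // 65_536
tokenLimit('qwen-vl-max-latest', 'output'); // 8_192

// Models with no output pattern fall back to the 4_096-token output default.
tokenLimit('unknown-model', 'input') === DEFAULT_TOKEN_LIMIT; // true
tokenLimit('unknown-model', 'output') === DEFAULT_OUTPUT_TOKEN_LIMIT; // true
```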
**packages/test-utils/package.json**

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@qwen-code/qwen-code-test-utils",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "private": true,
   "main": "src/index.ts",
   "license": "Apache-2.0",
```
**packages/vscode-ide-companion/package.json**

```diff
@@ -2,7 +2,7 @@
   "name": "qwen-code-vscode-ide-companion",
   "displayName": "Qwen Code Companion",
   "description": "Enable Qwen Code with direct access to your VS Code workspace.",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "publisher": "qwenlm",
   "icon": "assets/icon.png",
   "repository": {
```