diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
index cdddd0ba..cc62c213 100644
--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
@@ -560,4 +560,146 @@ describe('DashScopeOpenAICompatibleProvider', () => {
       ]);
     });
   });
+
+  describe('output token limits', () => {
+    it('should limit max_tokens when it exceeds model limit for qwen3-coder-plus', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Exceeds the 65536 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(65536); // Should be limited to model's output limit
+    });
+
+    it('should limit max_tokens when it exceeds model limit for qwen-vl-max-latest', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen-vl-max-latest',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 20000, // Exceeds the 8192 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(8192); // Should be limited to model's output limit
+    });
+
+    it('should not modify max_tokens when it is within model limit', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 1000, // Within the 65536 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(1000); // Should remain unchanged
+    });
+
+    it('should not add max_tokens when not present in request', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        // No max_tokens parameter
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBeUndefined(); // Should remain undefined
+    });
+
+    it('should handle null max_tokens parameter', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: null,
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBeNull(); // Should remain null
+    });
+
+    it('should use default output limit for unknown models', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'unknown-model',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 10000, // Exceeds the default 4096 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(4096); // Should be limited to default output limit
+    });
+
+    it('should preserve other request parameters when limiting max_tokens', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Will be limited
+        temperature: 0.8,
+        top_p: 0.9,
+        frequency_penalty: 0.1,
+        presence_penalty: 0.2,
+        stop: ['END'],
+        user: 'test-user',
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      // max_tokens should be limited
+      expect(result.max_tokens).toBe(65536);
+
+      // Other parameters should be preserved
+      expect(result.temperature).toBe(0.8);
+      expect(result.top_p).toBe(0.9);
+      expect(result.frequency_penalty).toBe(0.1);
+      expect(result.presence_penalty).toBe(0.2);
+      expect(result.stop).toEqual(['END']);
+      expect(result.user).toBe('test-user');
+    });
+
+    it('should work with vision models and output token limits', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen-vl-max-latest',
+        messages: [
+          {
+            role: 'user',
+            content: [
+              { type: 'text', text: 'Look at this image:' },
+              {
+                type: 'image_url',
+                image_url: { url: 'https://example.com/image.jpg' },
+              },
+            ],
+          },
+        ],
+        max_tokens: 20000, // Exceeds the 8192 limit
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(8192); // Should be limited
+      expect(
+        (result as { vl_high_resolution_images?: boolean })
+          .vl_high_resolution_images,
+      ).toBe(true); // Vision-specific parameter should be preserved
+    });
+
+    it('should handle streaming requests with output token limits', () => {
+      const request: OpenAI.Chat.ChatCompletionCreateParams = {
+        model: 'qwen3-coder-plus',
+        messages: [{ role: 'user', content: 'Hello' }],
+        max_tokens: 100000, // Exceeds the 65536 limit
+        stream: true,
+      };
+
+      const result = provider.buildRequest(request, 'test-prompt-id');
+
+      expect(result.max_tokens).toBe(65536); // Should be limited
+      expect(result.stream).toBe(true); // Streaming should be preserved
+    });
+  });
 });
diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
index 86cb54c0..fda3d3e5 100644
--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
@@ -3,6 +3,7 @@ import type { Config } from '../../../config/config.js';
 import type { ContentGeneratorConfig } from '../../contentGenerator.js';
 import { AuthType } from '../../contentGenerator.js';
 import { DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES } from '../constants.js';
+import { tokenLimit } from '../../tokenLimits.js';
 import type {
   OpenAICompatibleProvider,
   DashScopeRequestMetadata,
@@ -65,6 +66,19 @@ export class DashScopeOpenAICompatibleProvider
     });
   }

+  /**
+   * Build and configure the request for DashScope API.
+   *
+   * This method applies DashScope-specific configurations including:
+   * - Cache control for system and user messages
+   * - Output token limits based on model capabilities
+   * - Vision model specific parameters (vl_high_resolution_images)
+   * - Request metadata for session tracking
+   *
+   * @param request - The original chat completion request parameters
+   * @param userPromptId - Unique identifier for the user prompt for session tracking
+   * @returns Configured request with DashScope-specific parameters applied
+   */
   buildRequest(
     request: OpenAI.Chat.ChatCompletionCreateParams,
     userPromptId: string,
@@ -79,21 +93,28 @@
       messages = this.addDashScopeCacheControl(messages, cacheTarget);
     }

+    // Apply output token limits based on model capabilities
+    // This ensures max_tokens doesn't exceed the model's maximum output limit
+    const requestWithTokenLimits = this.applyOutputTokenLimit(
+      request,
+      request.model,
+    );
+
     if (request.model.startsWith('qwen-vl')) {
       return {
-        ...request,
+        ...requestWithTokenLimits,
         messages,
         ...(this.buildMetadata(userPromptId) || {}),
         /* @ts-expect-error dashscope exclusive */
         vl_high_resolution_images: true,
-      };
+      } as OpenAI.Chat.ChatCompletionCreateParams;
     }

     return {
-      ...request, // Preserve all original parameters including sampling params
+      ...requestWithTokenLimits, // Preserve all original parameters including sampling params and adjusted max_tokens
       messages,
       ...(this.buildMetadata(userPromptId) || {}),
-    };
+    } as OpenAI.Chat.ChatCompletionCreateParams;
   }

   buildMetadata(userPromptId: string): DashScopeRequestMetadata {
@@ -246,6 +267,41 @@
     return contentArray;
   }

+  /**
+   * Apply output token limit to a request's max_tokens parameter.
+   *
+   * Ensures that existing max_tokens parameters don't exceed the model's maximum output
+   * token limit. Only modifies max_tokens when already present in the request.
+   *
+   * @param request - The chat completion request parameters
+   * @param model - The model name to get the output token limit for
+   * @returns The request with max_tokens adjusted to respect the model's limits (if present)
+   */
+  private applyOutputTokenLimit<T extends OpenAI.Chat.ChatCompletionCreateParams>(
+    request: T,
+    model: string,
+  ): T {
+    const currentMaxTokens = request.max_tokens;
+
+    // Only process if max_tokens is already present in the request
+    if (currentMaxTokens === undefined || currentMaxTokens === null) {
+      return request; // No max_tokens parameter, return unchanged
+    }
+
+    const modelLimit = tokenLimit(model, 'output');
+
+    // If max_tokens exceeds the model limit, cap it to the model's limit
+    if (currentMaxTokens > modelLimit) {
+      return {
+        ...request,
+        max_tokens: modelLimit,
+      };
+    }
+
+    // If max_tokens is within the limit, return the request unchanged
+    return request;
+  }
+
   /**
    * Check if cache control should be disabled based on configuration.
    *
diff --git a/packages/core/src/core/tokenLimits.test.ts b/packages/core/src/core/tokenLimits.test.ts
index 6ef81634..150eb575 100644
--- a/packages/core/src/core/tokenLimits.test.ts
+++ b/packages/core/src/core/tokenLimits.test.ts
@@ -1,5 +1,10 @@
 import { describe, it, expect } from 'vitest';
-import { normalize, tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js';
+import {
+  normalize,
+  tokenLimit,
+  DEFAULT_TOKEN_LIMIT,
+  DEFAULT_OUTPUT_TOKEN_LIMIT,
+} from './tokenLimits.js';

 describe('normalize', () => {
   it('should lowercase and trim the model string', () => {
@@ -225,3 +230,96 @@ describe('tokenLimit', () => {
     expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000);
   });
 });
+
+describe('tokenLimit with output type', () => {
+  describe('Qwen models with output limits', () => {
+    it('should return the correct output limit for qwen3-coder-plus', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536);
+      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
+    });
+
+    it('should return the correct output limit for qwen-vl-max-latest', () => {
+      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192);
+    });
+  });
+
+  describe('Default output limits', () => {
+    it('should return the default output limit for unknown models', () => {
+      expect(tokenLimit('unknown-model', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('gpt-4', 'output')).toBe(DEFAULT_OUTPUT_TOKEN_LIMIT);
+      expect(tokenLimit('claude-3.5-sonnet', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+    });
+
+    it('should return the default output limit for models without specific output patterns', () => {
+      expect(tokenLimit('qwen3-coder-7b', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('qwen-plus', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+      expect(tokenLimit('qwen-vl-max', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      );
+    });
+  });
+
+  describe('Input vs Output limits comparison', () => {
+    it('should return different limits for input vs output for qwen3-coder-plus', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); // 1M input
+      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); // 64K output
+    });
+
+    it('should return different limits for input vs output for qwen-vl-max-latest', () => {
+      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); // 128K input
+      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); // 8K output
+    });
+
+    it('should return same default limits for unknown models', () => {
+      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); // 128K input
+      expect(tokenLimit('unknown-model', 'output')).toBe(
+        DEFAULT_OUTPUT_TOKEN_LIMIT,
+      ); // 4K output
+    });
+  });
+
+  describe('Backward compatibility', () => {
+    it('should default to input type when no type is specified', () => {
+      expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); // Should be input limit
+      expect(tokenLimit('qwen-vl-max-latest')).toBe(131072); // Should be input limit
+      expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); // Should be input default
+    });
+
+    it('should work with explicit input type', () => {
+      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576);
+      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072);
+      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT);
+    });
+  });
+
+  describe('Model normalization with output limits', () => {
+    it('should handle normalized model names for output limits', () => {
+      expect(tokenLimit('QWEN3-CODER-PLUS', 'output')).toBe(65536);
+      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
+      expect(tokenLimit('QWEN-VL-MAX-LATEST', 'output')).toBe(8192);
+    });
+
+    it('should handle complex model strings for output limits', () => {
+      expect(
+        tokenLimit(
+          ' a/b/c|QWEN3-CODER-PLUS:qwen3-coder-plus-2024-05-13 ',
+          'output',
+        ),
+      ).toBe(65536);
+      expect(
+        tokenLimit(
+          'provider/qwen-vl-max-latest:qwen-vl-max-latest-v1',
+          'output',
+        ),
+      ).toBe(8192);
+    });
+  });
+});
diff --git a/packages/core/src/core/tokenLimits.ts b/packages/core/src/core/tokenLimits.ts
index 2e502037..67ff6a86 100644
--- a/packages/core/src/core/tokenLimits.ts
+++ b/packages/core/src/core/tokenLimits.ts
@@ -1,7 +1,15 @@
 type Model = string;
 type TokenCount = number;

+/**
+ * Token limit types for different use cases.
+ * - 'input': Maximum input context window size
+ * - 'output': Maximum output tokens that can be generated in a single response
+ */
+export type TokenLimitType = 'input' | 'output';
+
 export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)
+export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 4_096; // 4K tokens

 /**
  * Accurate numeric limits:
@@ -18,6 +26,10 @@ const LIMITS = {
   '1m': 1_048_576,
   '2m': 2_097_152,
   '10m': 10_485_760, // 10 million tokens
+  // Output token limits (typically much smaller than input limits)
+  '4k': 4_096,
+  '8k': 8_192,
+  '16k': 16_384,
 } as const;

 /** Robust normalizer: strips provider prefixes, pipes/colons, date/version suffixes, etc. */
@@ -36,7 +48,7 @@ export function normalize(model: string): string {
   // - dates (e.g., -20250219), -v1, version numbers, 'latest', 'preview' etc.
   s = s.replace(/-preview/g, '');
   // Special handling for Qwen model names that include "-latest" as part of the model name
-  if (!s.match(/^qwen-(?:plus|flash)-latest$/)) {
+  if (!s.match(/^qwen-(?:plus|flash|vl-max)-latest$/)) {
     // \d{6,} - Match 6 or more digits (dates) like -20250219 (6+ digit dates)
     // \d+x\d+b - Match patterns like 4x8b, -7b, -70b
     // v\d+(?:\.\d+)* - Match version patterns starting with 'v' like -v1, -v1.2, -v2.1.3
@@ -142,16 +154,48 @@
   [/^mistral-large-2.*$/, LIMITS['128k']],
 ];

-/** Return the token limit for a model string (uses normalize + ordered regex list). */
-export function tokenLimit(model: Model): TokenCount {
+/**
+ * Output token limit patterns for specific model families.
+ * These patterns define the maximum number of tokens that can be generated
+ * in a single response for specific models.
+ */
+const OUTPUT_PATTERNS: Array<[RegExp, TokenCount]> = [
+  // -------------------
+  // Alibaba / Qwen - DashScope Models
+  // -------------------
+  // Qwen3-Coder-Plus: 65,536 max output tokens
+  [/^qwen3-coder-plus(-.*)?$/, LIMITS['64k']],
+
+  // Qwen-VL-Max-Latest: 8,192 max output tokens
+  [/^qwen-vl-max-latest$/, LIMITS['8k']],
+];
+
+/**
+ * Return the token limit for a model string based on the specified type.
+ *
+ * This function determines the maximum number of tokens for either input context
+ * or output generation based on the model and token type. It uses the same
+ * normalization logic for consistency across both input and output limits.
+ *
+ * @param model - The model name to get the token limit for
+ * @param type - The type of token limit ('input' for context window, 'output' for generation)
+ * @returns The maximum number of tokens allowed for this model and type
+ */
+export function tokenLimit(
+  model: Model,
+  type: TokenLimitType = 'input',
+): TokenCount {
   const norm = normalize(model);

-  for (const [regex, limit] of PATTERNS) {
+  // Choose the appropriate patterns based on token type
+  const patterns = type === 'output' ? OUTPUT_PATTERNS : PATTERNS;
+
+  for (const [regex, limit] of patterns) {
     if (regex.test(norm)) {
       return limit;
     }
   }

-  // final fallback: DEFAULT_TOKEN_LIMIT (power-of-two 128K)
-  return DEFAULT_TOKEN_LIMIT;
+  // Return appropriate default based on token type
+  return type === 'output' ? DEFAULT_OUTPUT_TOKEN_LIMIT : DEFAULT_TOKEN_LIMIT;
 }
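Reviewer note (not part of the patch): a minimal standalone sketch of the capping rule the diff implements, using the patched tokenLimit(model, 'output') API. The helper name capMaxTokens and the relative import path are assumptions for illustration; the rule is that max_tokens is left alone when absent or null, and is only ever lowered to the model's output limit, never raised.

// Sketch only; assumes it sits next to tokenLimits.ts so the relative import resolves.
import { tokenLimit } from './tokenLimits.js';

// Hypothetical helper mirroring what applyOutputTokenLimit does to a request's max_tokens:
// cap an explicit value to the model's maximum output tokens, pass through absent/null.
function capMaxTokens(
  maxTokens: number | null | undefined,
  model: string,
): number | null | undefined {
  if (maxTokens === undefined || maxTokens === null) {
    return maxTokens; // no explicit max_tokens: leave the request untouched
  }
  return Math.min(maxTokens, tokenLimit(model, 'output'));
}

capMaxTokens(100000, 'qwen3-coder-plus'); // 65536 (model's output limit)
capMaxTokens(1000, 'qwen3-coder-plus'); // 1000 (within limit, unchanged)
capMaxTokens(10000, 'unknown-model'); // 4096 (DEFAULT_OUTPUT_TOKEN_LIMIT)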