fix: output token limit for qwen (#664)

Mingholy
2025-09-23 14:28:59 +08:00
committed by GitHub
parent 3579d6555a
commit 014059e8a6
4 changed files with 351 additions and 11 deletions

View File

@@ -560,4 +560,146 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      ]);
    });
  });

  describe('output token limits', () => {
    it('should limit max_tokens when it exceeds model limit for qwen3-coder-plus', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 100000, // Exceeds the 65536 limit
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(65536); // Should be limited to model's output limit
    });

    it('should limit max_tokens when it exceeds model limit for qwen-vl-max-latest', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen-vl-max-latest',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 20000, // Exceeds the 8192 limit
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(8192); // Should be limited to model's output limit
    });

    it('should not modify max_tokens when it is within model limit', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 1000, // Within the 65536 limit
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(1000); // Should remain unchanged
    });

    it('should not add max_tokens when not present in request', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        // No max_tokens parameter
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBeUndefined(); // Should remain undefined
    });

    it('should handle null max_tokens parameter', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: null,
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBeNull(); // Should remain null
    });

    it('should use default output limit for unknown models', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'unknown-model',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 10000, // Exceeds the default 4096 limit
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(4096); // Should be limited to default output limit
    });

    it('should preserve other request parameters when limiting max_tokens', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 100000, // Will be limited
        temperature: 0.8,
        top_p: 0.9,
        frequency_penalty: 0.1,
        presence_penalty: 0.2,
        stop: ['END'],
        user: 'test-user',
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      // max_tokens should be limited
      expect(result.max_tokens).toBe(65536);
      // Other parameters should be preserved
      expect(result.temperature).toBe(0.8);
      expect(result.top_p).toBe(0.9);
      expect(result.frequency_penalty).toBe(0.1);
      expect(result.presence_penalty).toBe(0.2);
      expect(result.stop).toEqual(['END']);
      expect(result.user).toBe('test-user');
    });

    it('should work with vision models and output token limits', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen-vl-max-latest',
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'Look at this image:' },
              {
                type: 'image_url',
                image_url: { url: 'https://example.com/image.jpg' },
              },
            ],
          },
        ],
        max_tokens: 20000, // Exceeds the 8192 limit
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(8192); // Should be limited
      expect(
        (result as { vl_high_resolution_images?: boolean })
          .vl_high_resolution_images,
      ).toBe(true); // Vision-specific parameter should be preserved
    });

    it('should handle streaming requests with output token limits', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
        max_tokens: 100000, // Exceeds the 65536 limit
        stream: true,
      };

      const result = provider.buildRequest(request, 'test-prompt-id');

      expect(result.max_tokens).toBe(65536); // Should be limited
      expect(result.stream).toBe(true); // Streaming should be preserved
    });
  });
});

View File

@@ -3,6 +3,7 @@ import type { Config } from '../../../config/config.js';
import type { ContentGeneratorConfig } from '../../contentGenerator.js';
import { AuthType } from '../../contentGenerator.js';
import { DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES } from '../constants.js';
import { tokenLimit } from '../../tokenLimits.js';
import type {
  OpenAICompatibleProvider,
  DashScopeRequestMetadata,
@@ -65,6 +66,19 @@ export class DashScopeOpenAICompatibleProvider
    });
  }

  /**
   * Build and configure the request for DashScope API.
   *
   * This method applies DashScope-specific configurations including:
   * - Cache control for system and user messages
   * - Output token limits based on model capabilities
   * - Vision model specific parameters (vl_high_resolution_images)
   * - Request metadata for session tracking
   *
   * @param request - The original chat completion request parameters
   * @param userPromptId - Unique identifier for the user prompt for session tracking
   * @returns Configured request with DashScope-specific parameters applied
   */
  buildRequest(
    request: OpenAI.Chat.ChatCompletionCreateParams,
    userPromptId: string,
@@ -79,21 +93,28 @@ export class DashScopeOpenAICompatibleProvider
      messages = this.addDashScopeCacheControl(messages, cacheTarget);
    }

    // Apply output token limits based on model capabilities
    // This ensures max_tokens doesn't exceed the model's maximum output limit
    const requestWithTokenLimits = this.applyOutputTokenLimit(
      request,
      request.model,
    );

    if (request.model.startsWith('qwen-vl')) {
      return {
        ...requestWithTokenLimits,
        messages,
        ...(this.buildMetadata(userPromptId) || {}),
        /* @ts-expect-error dashscope exclusive */
        vl_high_resolution_images: true,
      } as OpenAI.Chat.ChatCompletionCreateParams;
    }

    return {
      ...requestWithTokenLimits, // Preserve all original parameters including sampling params and adjusted max_tokens
      messages,
      ...(this.buildMetadata(userPromptId) || {}),
    } as OpenAI.Chat.ChatCompletionCreateParams;
  }

  buildMetadata(userPromptId: string): DashScopeRequestMetadata {
@@ -246,6 +267,41 @@ export class DashScopeOpenAICompatibleProvider
    return contentArray;
  }

  /**
   * Apply output token limit to a request's max_tokens parameter.
   *
   * Ensures that existing max_tokens parameters don't exceed the model's maximum output
   * token limit. Only modifies max_tokens when already present in the request.
   *
   * @param request - The chat completion request parameters
   * @param model - The model name to get the output token limit for
   * @returns The request with max_tokens adjusted to respect the model's limits (if present)
   */
  private applyOutputTokenLimit<T extends { max_tokens?: number | null }>(
    request: T,
    model: string,
  ): T {
    const currentMaxTokens = request.max_tokens;

    // Only process if max_tokens is already present in the request
    if (currentMaxTokens === undefined || currentMaxTokens === null) {
      return request; // No max_tokens parameter, return unchanged
    }

    const modelLimit = tokenLimit(model, 'output');

    // If max_tokens exceeds the model limit, cap it to the model's limit
    if (currentMaxTokens > modelLimit) {
      return {
        ...request,
        max_tokens: modelLimit,
      };
    }

    // If max_tokens is within the limit, return the request unchanged
    return request;
  }

  /**
   * Check if cache control should be disabled based on configuration.
   *
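
In short, the new applyOutputTokenLimit step caps max_tokens before the request goes out and leaves absent or null values alone. A minimal standalone sketch of that rule, with a hypothetical lookupOutputLimit standing in for the real tokenLimit(model, 'output') import:

    type ChatRequest = { model: string; max_tokens?: number | null };

    // Hypothetical stand-in for tokenLimit(model, 'output') from tokenLimits.js,
    // hard-coding the two limits this commit exercises.
    const lookupOutputLimit = (model: string): number =>
      model.startsWith('qwen3-coder-plus') ? 65_536 : 4_096;

    function capMaxTokens<T extends ChatRequest>(request: T): T {
      const current = request.max_tokens;
      // Absent or null max_tokens passes through untouched.
      if (current === undefined || current === null) return request;
      const limit = lookupOutputLimit(request.model);
      // Cap only when the caller asked for more than the model can emit.
      return current > limit ? { ...request, max_tokens: limit } : request;
    }

    // capMaxTokens({ model: 'qwen3-coder-plus', max_tokens: 100_000 }); // -> 65_536
    // capMaxTokens({ model: 'qwen3-coder-plus', max_tokens: 1_000 });   // -> 1_000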

View File

@@ -1,5 +1,10 @@
import { describe, it, expect } from 'vitest';
import {
  normalize,
  tokenLimit,
  DEFAULT_TOKEN_LIMIT,
  DEFAULT_OUTPUT_TOKEN_LIMIT,
} from './tokenLimits.js';

describe('normalize', () => {
  it('should lowercase and trim the model string', () => {
@@ -225,3 +230,96 @@ describe('tokenLimit', () => {
    expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000);
  });
});

describe('tokenLimit with output type', () => {
  describe('Qwen models with output limits', () => {
    it('should return the correct output limit for qwen3-coder-plus', () => {
      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536);
      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
    });

    it('should return the correct output limit for qwen-vl-max-latest', () => {
      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192);
    });
  });

  describe('Default output limits', () => {
    it('should return the default output limit for unknown models', () => {
      expect(tokenLimit('unknown-model', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      );
      expect(tokenLimit('gpt-4', 'output')).toBe(DEFAULT_OUTPUT_TOKEN_LIMIT);
      expect(tokenLimit('claude-3.5-sonnet', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      );
    });

    it('should return the default output limit for models without specific output patterns', () => {
      expect(tokenLimit('qwen3-coder-7b', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      );
      expect(tokenLimit('qwen-plus', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      );
      expect(tokenLimit('qwen-vl-max', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      );
    });
  });

  describe('Input vs Output limits comparison', () => {
    it('should return different limits for input vs output for qwen3-coder-plus', () => {
      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); // 1M input
      expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); // 64K output
    });

    it('should return different limits for input vs output for qwen-vl-max-latest', () => {
      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); // 128K input
      expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); // 8K output
    });

    it('should return same default limits for unknown models', () => {
      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); // 128K input
      expect(tokenLimit('unknown-model', 'output')).toBe(
        DEFAULT_OUTPUT_TOKEN_LIMIT,
      ); // 4K output
    });
  });

  describe('Backward compatibility', () => {
    it('should default to input type when no type is specified', () => {
      expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); // Should be input limit
      expect(tokenLimit('qwen-vl-max-latest')).toBe(131072); // Should be input limit
      expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); // Should be input default
    });

    it('should work with explicit input type', () => {
      expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576);
      expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072);
      expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT);
    });
  });

  describe('Model normalization with output limits', () => {
    it('should handle normalized model names for output limits', () => {
      expect(tokenLimit('QWEN3-CODER-PLUS', 'output')).toBe(65536);
      expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536);
      expect(tokenLimit('QWEN-VL-MAX-LATEST', 'output')).toBe(8192);
    });

    it('should handle complex model strings for output limits', () => {
      expect(
        tokenLimit(
          ' a/b/c|QWEN3-CODER-PLUS:qwen3-coder-plus-2024-05-13 ',
          'output',
        ),
      ).toBe(65536);
      expect(
        tokenLimit(
          'provider/qwen-vl-max-latest:qwen-vl-max-latest-v1',
          'output',
        ),
      ).toBe(8192);
    });
  });
});

View File

@@ -1,7 +1,15 @@
type Model = string;
type TokenCount = number;

/**
 * Token limit types for different use cases.
 * - 'input': Maximum input context window size
 * - 'output': Maximum output tokens that can be generated in a single response
 */
export type TokenLimitType = 'input' | 'output';

export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)
export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 4_096; // 4K tokens

/**
 * Accurate numeric limits:
@@ -18,6 +26,10 @@ const LIMITS = {
  '1m': 1_048_576,
  '2m': 2_097_152,
  '10m': 10_485_760, // 10 million tokens

  // Output token limits (typically much smaller than input limits)
  '4k': 4_096,
  '8k': 8_192,
  '16k': 16_384,
} as const;
/** Robust normalizer: strips provider prefixes, pipes/colons, date/version suffixes, etc. */
@@ -36,7 +48,7 @@ export function normalize(model: string): string {
  // - dates (e.g., -20250219), -v1, version numbers, 'latest', 'preview' etc.
  s = s.replace(/-preview/g, '');

  // Special handling for Qwen model names that include "-latest" as part of the model name
  if (!s.match(/^qwen-(?:plus|flash|vl-max)-latest$/)) {
    // \d{6,} - Match 6 or more digits (dates) like -20250219 (6+ digit dates)
    // \d+x\d+b - Match patterns like 4x8b, -7b, -70b
    // v\d+(?:\.\d+)* - Match version patterns starting with 'v' like -v1, -v1.2, -v2.1.3
@@ -142,16 +154,48 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [
  [/^mistral-large-2.*$/, LIMITS['128k']],
];
/**
 * Output token limit patterns for specific model families.
 * These patterns define the maximum number of tokens that can be generated
 * in a single response for specific models.
 */
const OUTPUT_PATTERNS: Array<[RegExp, TokenCount]> = [
  // -------------------
  // Alibaba / Qwen - DashScope Models
  // -------------------
  // Qwen3-Coder-Plus: 65,536 max output tokens
  [/^qwen3-coder-plus(-.*)?$/, LIMITS['64k']],
  // Qwen-VL-Max-Latest: 8,192 max output tokens
  [/^qwen-vl-max-latest$/, LIMITS['8k']],
];

/**
 * Return the token limit for a model string based on the specified type.
 *
 * This function determines the maximum number of tokens for either input context
 * or output generation based on the model and token type. It uses the same
 * normalization logic for consistency across both input and output limits.
 *
 * @param model - The model name to get the token limit for
 * @param type - The type of token limit ('input' for context window, 'output' for generation)
 * @returns The maximum number of tokens allowed for this model and type
 */
export function tokenLimit(
  model: Model,
  type: TokenLimitType = 'input',
): TokenCount {
  const norm = normalize(model);

  // Choose the appropriate patterns based on token type
  const patterns = type === 'output' ? OUTPUT_PATTERNS : PATTERNS;

  for (const [regex, limit] of patterns) {
    if (regex.test(norm)) {
      return limit;
    }
  }

  // Return appropriate default based on token type
  return type === 'output' ? DEFAULT_OUTPUT_TOKEN_LIMIT : DEFAULT_TOKEN_LIMIT;
}
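
Taken together, the patterns above let the one tokenLimit function serve both roles. A sketch of the expected return values, assuming the module is imported as in the test file above and using only values defined in this commit:

    import { tokenLimit, DEFAULT_TOKEN_LIMIT, DEFAULT_OUTPUT_TOKEN_LIMIT } from './tokenLimits.js';

    // Input limits (the default type) behave exactly as before this commit.
    tokenLimit('qwen3-coder-plus');             // 1_048_576 (1M context)
    tokenLimit('unknown-model');                // DEFAULT_TOKEN_LIMIT (131_072)

    // Output limits consult OUTPUT_PATTERNS, then fall back to the 4K default.
    tokenLimit('qwen3-coder-plus', 'output');   // 65_536
    tokenLimit('qwen-vl-max-latest', 'output'); // 8_192
    tokenLimit('gpt-4', 'output');              // DEFAULT_OUTPUT_TOKEN_LIMIT (4_096)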