From e63233cefc4e241aa504a3d55194896d18ab9e9d Mon Sep 17 00:00:00 2001 From: zhutao100 Date: Mon, 8 Sep 2025 20:38:47 -0700 Subject: [PATCH] Re-implement tokenLimits class to make it work correctly for Qwen and many other model types. (#542) The original tokenLimits was copied over from gemini-cli and only works with gemini. --- packages/core/src/core/tokenLimits.test.ts | 227 +++++++++++++++++++++ packages/core/src/core/tokenLimits.ts | 174 +++++++++++++--- 2 files changed, 375 insertions(+), 26 deletions(-) create mode 100644 packages/core/src/core/tokenLimits.test.ts diff --git a/packages/core/src/core/tokenLimits.test.ts b/packages/core/src/core/tokenLimits.test.ts new file mode 100644 index 00000000..6ef81634 --- /dev/null +++ b/packages/core/src/core/tokenLimits.test.ts @@ -0,0 +1,227 @@ +import { describe, it, expect } from 'vitest'; +import { normalize, tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js'; + +describe('normalize', () => { + it('should lowercase and trim the model string', () => { + expect(normalize(' GEMINI-1.5-PRO ')).toBe('gemini-1.5-pro'); + }); + + it('should strip provider prefixes', () => { + expect(normalize('google/gemini-1.5-pro')).toBe('gemini-1.5-pro'); + expect(normalize('anthropic/claude-3.5-sonnet')).toBe('claude-3.5-sonnet'); + }); + + it('should handle pipe and colon separators', () => { + expect(normalize('qwen|qwen2.5:qwen2.5-1m')).toBe('qwen2.5-1m'); + }); + + it('should collapse whitespace to a single hyphen', () => { + expect(normalize('claude 3.5 sonnet')).toBe('claude-3.5-sonnet'); + }); + + it('should remove date and version suffixes', () => { + expect(normalize('gemini-1.5-pro-20250219')).toBe('gemini-1.5-pro'); + expect(normalize('gpt-4o-mini-v1')).toBe('gpt-4o-mini'); + expect(normalize('claude-3.7-sonnet-20240715')).toBe('claude-3.7-sonnet'); + expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1'); + expect(normalize('gemini-2.0-flash-preview-20250520')).toBe( + 'gemini-2.0-flash', + ); + }); + + it('should 
remove quantization and numeric suffixes', () => { + expect(normalize('qwen3-coder-7b-4bit')).toBe('qwen3-coder-7b'); + expect(normalize('llama-4-scout-int8')).toBe('llama-4-scout'); + expect(normalize('mistral-large-2-bf16')).toBe('mistral-large-2'); + expect(normalize('deepseek-v3.1-q4')).toBe('deepseek-v3.1'); + expect(normalize('qwen2.5-quantized')).toBe('qwen2.5'); + }); + + it('should handle a combination of normalization rules', () => { + expect(normalize(' Google/GEMINI-2.5-PRO:gemini-2.5-pro-20250605 ')).toBe( + 'gemini-2.5-pro', + ); + }); + + it('should handle empty or null input', () => { + expect(normalize('')).toBe(''); + expect(normalize(undefined as unknown as string)).toBe(''); + expect(normalize(null as unknown as string)).toBe(''); + }); + + it('should remove preview suffixes', () => { + expect(normalize('gemini-2.0-flash-preview')).toBe('gemini-2.0-flash'); + }); + + it('should remove version numbers with dots when they are at the end', () => { + expect(normalize('gpt-4.1.1-latest')).toBe('gpt-4.1.1'); + expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1'); + }); +}); + +describe('tokenLimit', () => { + // Test cases for each model family + describe('Google Gemini', () => { + it('should return the correct limit for Gemini 1.5 Pro', () => { + expect(tokenLimit('gemini-1.5-pro')).toBe(2097152); + }); + it('should return the correct limit for Gemini 1.5 Flash', () => { + expect(tokenLimit('gemini-1.5-flash')).toBe(1048576); + }); + it('should return the correct limit for Gemini 2.5 Pro', () => { + expect(tokenLimit('gemini-2.5-pro')).toBe(1048576); + }); + it('should return the correct limit for Gemini 2.5 Flash', () => { + expect(tokenLimit('gemini-2.5-flash')).toBe(1048576); + }); + it('should return the correct limit for Gemini 2.0 Flash with image generation', () => { + expect(tokenLimit('gemini-2.0-flash-image-generation')).toBe(32768); + }); + it('should return the correct limit for Gemini 2.0 Flash', () => { + 
expect(tokenLimit('gemini-2.0-flash')).toBe(1048576); + }); + }); + + describe('OpenAI', () => { + it('should return the correct limit for o3-mini', () => { + expect(tokenLimit('o3-mini')).toBe(200000); + }); + it('should return the correct limit for o3 models', () => { + expect(tokenLimit('o3')).toBe(200000); + }); + it('should return the correct limit for o4-mini', () => { + expect(tokenLimit('o4-mini')).toBe(200000); + }); + it('should return the correct limit for gpt-4o-mini', () => { + expect(tokenLimit('gpt-4o-mini')).toBe(131072); + }); + it('should return the correct limit for gpt-4o', () => { + expect(tokenLimit('gpt-4o')).toBe(131072); + }); + it('should return the correct limit for gpt-4.1-mini', () => { + expect(tokenLimit('gpt-4.1-mini')).toBe(1048576); + }); + it('should return the correct limit for gpt-4.1 models', () => { + expect(tokenLimit('gpt-4.1')).toBe(1048576); + }); + it('should return the correct limit for gpt-4', () => { + expect(tokenLimit('gpt-4')).toBe(131072); + }); + }); + + describe('Anthropic Claude', () => { + it('should return the correct limit for Claude 3.5 Sonnet', () => { + expect(tokenLimit('claude-3.5-sonnet')).toBe(200000); + }); + it('should return the correct limit for Claude 3.7 Sonnet', () => { + expect(tokenLimit('claude-3.7-sonnet')).toBe(1048576); + }); + it('should return the correct limit for Claude Sonnet 4', () => { + expect(tokenLimit('claude-sonnet-4')).toBe(1048576); + }); + it('should return the correct limit for Claude Opus 4', () => { + expect(tokenLimit('claude-opus-4')).toBe(1048576); + }); + }); + + describe('Alibaba Qwen', () => { + it('should return the correct limit for qwen3-coder commercial models', () => { + expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); + expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1048576); + expect(tokenLimit('qwen3-coder-flash')).toBe(1048576); + expect(tokenLimit('qwen3-coder-flash-20250601')).toBe(1048576); + }); + + it('should return the correct limit for 
qwen3-coder open source models', () => { + expect(tokenLimit('qwen3-coder-7b')).toBe(262144); + expect(tokenLimit('qwen3-coder-480b-a35b-instruct')).toBe(262144); + expect(tokenLimit('qwen3-coder-30b-a3b-instruct')).toBe(262144); + }); + + it('should return the correct limit for qwen3 2507 variants', () => { + expect(tokenLimit('qwen3-some-model-2507-instruct')).toBe(262144); + }); + + it('should return the correct limit for qwen2.5-1m', () => { + expect(tokenLimit('qwen2.5-1m')).toBe(1048576); + expect(tokenLimit('qwen2.5-1m-instruct')).toBe(1048576); + }); + + it('should return the correct limit for qwen2.5', () => { + expect(tokenLimit('qwen2.5')).toBe(131072); + expect(tokenLimit('qwen2.5-instruct')).toBe(131072); + }); + + it('should return the correct limit for qwen-plus', () => { + expect(tokenLimit('qwen-plus-latest')).toBe(1048576); + expect(tokenLimit('qwen-plus')).toBe(131072); + }); + + it('should return the correct limit for qwen-flash', () => { + expect(tokenLimit('qwen-flash-latest')).toBe(1048576); + }); + + it('should return the correct limit for qwen-turbo', () => { + expect(tokenLimit('qwen-turbo')).toBe(131072); + expect(tokenLimit('qwen-turbo-latest')).toBe(131072); + }); + }); + + describe('ByteDance Seed-OSS', () => { + it('should return the correct limit for seed-oss', () => { + expect(tokenLimit('seed-oss')).toBe(524288); + }); + }); + + describe('Zhipu GLM', () => { + it('should return the correct limit for glm-4.5v', () => { + expect(tokenLimit('glm-4.5v')).toBe(65536); + }); + it('should return the correct limit for glm-4.5-air', () => { + expect(tokenLimit('glm-4.5-air')).toBe(131072); + }); + it('should return the correct limit for glm-4.5', () => { + expect(tokenLimit('glm-4.5')).toBe(131072); + }); + }); + + describe('Other models', () => { + it('should return the correct limit for deepseek-r1', () => { + expect(tokenLimit('deepseek-r1')).toBe(131072); + }); + it('should return the correct limit for deepseek-v3', () => { + 
expect(tokenLimit('deepseek-v3')).toBe(131072); + }); + it('should return the correct limit for deepseek-v3.1', () => { + expect(tokenLimit('deepseek-v3.1')).toBe(131072); + }); + it('should return the correct limit for kimi-k2-instruct', () => { + expect(tokenLimit('kimi-k2-instruct')).toBe(131072); + }); + it('should return the correct limit for gpt-oss', () => { + expect(tokenLimit('gpt-oss')).toBe(131072); + }); + it('should return the correct limit for llama-4-scout', () => { + expect(tokenLimit('llama-4-scout')).toBe(10485760); + }); + it('should return the correct limit for mistral-large-2', () => { + expect(tokenLimit('mistral-large-2')).toBe(131072); + }); + }); + + // Test for default limit + it('should return the default token limit for an unknown model', () => { + expect(tokenLimit('unknown-model-v1.0')).toBe(DEFAULT_TOKEN_LIMIT); + }); + + // Test with complex model string + it('should return the correct limit for a complex model string', () => { + expect(tokenLimit(' a/b/c|GPT-4o:gpt-4o-2024-05-13-q4 ')).toBe(131072); + }); + + // Test case-insensitive matching + it('should handle case-insensitive model names', () => { + expect(tokenLimit('GPT-4O')).toBe(131072); + expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000); + }); +}); diff --git a/packages/core/src/core/tokenLimits.ts b/packages/core/src/core/tokenLimits.ts index d238cdb3..e51becab 100644 --- a/packages/core/src/core/tokenLimits.ts +++ b/packages/core/src/core/tokenLimits.ts @@ -1,32 +1,154 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - type Model = string; type TokenCount = number; -export const DEFAULT_TOKEN_LIMIT = 1_048_576; +export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two) -export function tokenLimit(model: Model): TokenCount { - // Add other models as they become relevant or if specified by config - // Pulled from https://ai.google.dev/gemini-api/docs/models - switch (model) { - case 'gemini-1.5-pro': - 
return 2_097_152; - case 'gemini-1.5-flash': - case 'gemini-2.5-pro-preview-05-06': - case 'gemini-2.5-pro-preview-06-05': - case 'gemini-2.5-pro': - case 'gemini-2.5-flash-preview-05-20': - case 'gemini-2.5-flash': - case 'gemini-2.5-flash-lite': - case 'gemini-2.0-flash': - return 1_048_576; - case 'gemini-2.0-flash-preview-image-generation': - return 32_000; - default: - return DEFAULT_TOKEN_LIMIT; +/** + * Accurate numeric limits: + * - power-of-two approximations (128K -> 131072, 256K -> 262144, etc.) + * - vendor-declared exact values (e.g., 200k -> 200000) are used as stated in docs. + */ +const LIMITS = { + '32k': 32_768, + '64k': 65_536, + '128k': 131_072, + '200k': 200_000, // vendor-declared decimal (OpenAI / Anthropic use 200k) + '256k': 262_144, + '512k': 524_288, + '1m': 1_048_576, + '2m': 2_097_152, + '10m': 10_485_760, // "10M" expressed as 10 * 1_048_576 tokens +} as const; + +/** Robust normalizer: strips provider prefixes, pipes/colons, date/version suffixes, etc. */ +export function normalize(model: string): string { + let s = (model ?? '').toLowerCase().trim(); + + // keep final path segment (strip provider prefixes), handle pipe/colon + s = s.replace(/^.*\//, ''); + s = s.split('|').pop() ?? s; + s = s.split(':').pop() ?? s; + + // collapse whitespace to single hyphen + s = s.replace(/\s+/g, '-'); + + // remove build / date / revision suffixes: + // - '-preview' (removed wherever it occurs), then trailing dates (e.g., -20250219), -v1, 'latest' etc. 
+ s = s.replace(/-preview/g, ''); + // Special handling for Qwen model names that include "-latest" as part of the model name + if (!s.match(/^qwen-(?:plus|flash)-latest$/)) { + // \d{6,} - Match 6 or more trailing digits (dates) like -20250219 + // \d+x\d+b - Match NxMb size patterns like -4x8b (plain sizes such as -7b or -70b are NOT matched and stay in the name) + // v\d+(?:\.\d+)* - Match version patterns starting with 'v' like -v1, -v1.2, -v2.1.3; NOTE(review): this also strips '-v3.1' from 'deepseek-v3.1', leaving 'deepseek' + // -\d+(?:\.\d+)+ - Match dotted version numbers preceded by a SECOND dash (the group below already consumed one), + // like --1.1, --2.0.1; a single-dash suffix such as model-test-1.1 is left unchanged. + // Note: this does NOT match 4.1 in gpt-4.1 because only one dash precedes 4.1 there. + // latest - Match the literal string "latest" + s = s.replace( + /-(?:\d{6,}|\d+x\d+b|v\d+(?:\.\d+)*|-\d+(?:\.\d+)+|latest)$/g, + '', + ); } + + // remove quantization / numeric / precision suffixes common in local/community models + s = s.replace(/-(?:\d?bit|int[48]|bf16|fp16|q[45]|quantized)$/g, ''); + + return s; +} + +/** Ordered regex patterns: most specific -> most general (first match wins). */ +const PATTERNS: Array<[RegExp, TokenCount]> = [ + // ------------------- + // Google Gemini + // ------------------- + [/^gemini-1\.5-pro$/, LIMITS['2m']], + [/^gemini-1\.5-flash$/, LIMITS['1m']], + [/^gemini-2\.5-pro.*$/, LIMITS['1m']], + [/^gemini-2\.5-flash.*$/, LIMITS['1m']], + [/^gemini-2\.0-flash-image-generation$/, LIMITS['32k']], + [/^gemini-2\.0-flash.*$/, LIMITS['1m']], + + // ------------------- + // OpenAI (o3 / o4-mini / gpt-4.1 / gpt-4o family) + // o3 and o4-mini document a 200,000-token context window (decimal). + // Note: GPT-4.1 models typically report 1_048_576 (1M) context in OpenAI announcements. 
+ [/^o3(?:-mini|$).*$/, LIMITS['200k']], + [/^o3.*$/, LIMITS['200k']], + [/^o4-mini.*$/, LIMITS['200k']], + [/^gpt-4\.1-mini.*$/, LIMITS['1m']], + [/^gpt-4\.1.*$/, LIMITS['1m']], + [/^gpt-4o-mini.*$/, LIMITS['128k']], + [/^gpt-4o.*$/, LIMITS['128k']], + [/^gpt-4.*$/, LIMITS['128k']], + + // ------------------- + // Anthropic Claude + // - Claude Sonnet / Sonnet 3.5 and related Sonnet variants: 200,000 tokens documented. + // - Some Sonnet/Opus models offer 1M in beta/enterprise tiers (handled separately if needed). + [/^claude-3\.5-sonnet.*$/, LIMITS['200k']], + [/^claude-3\.7-sonnet.*$/, LIMITS['1m']], // some Sonnet 3.7/Opus variants advertise 1M beta in docs + [/^claude-sonnet-4.*$/, LIMITS['1m']], + [/^claude-opus-4.*$/, LIMITS['1m']], + + // ------------------- + // Alibaba / Qwen + // ------------------- + // Commercial Qwen3-Coder-Plus: 1M token context + [/^qwen3-coder-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-plus" and date variants + + // Commercial Qwen3-Coder-Flash: 1M token context + [/^qwen3-coder-flash(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-flash" and date variants + + // Open-source Qwen3-Coder variants: 256K native + [/^qwen3-coder-.*$/, LIMITS['256k']], + // Open-source Qwen3 2507 variants: 256K native + [/^qwen3-.*-2507-.*$/, LIMITS['256k']], + + // Open-source long-context Qwen2.5-1M + [/^qwen2\.5-1m.*$/, LIMITS['1m']], + + // Standard Qwen2.5: 128K + [/^qwen2\.5.*$/, LIMITS['128k']], + + // Studio commercial Qwen-Plus / Qwen-Flash / Qwen-Turbo + [/^qwen-plus-latest$/, LIMITS['1m']], // Commercial latest: 1M + [/^qwen-plus.*$/, LIMITS['128k']], // Standard: 128K + [/^qwen-flash-latest$/, LIMITS['1m']], + [/^qwen-turbo.*$/, LIMITS['128k']], + + // ------------------- + // ByteDance Seed-OSS (512K) + // ------------------- + [/^seed-oss.*$/, LIMITS['512k']], + + // ------------------- + // Zhipu GLM + // ------------------- + [/^glm-4\.5v.*$/, LIMITS['64k']], + [/^glm-4\.5-air.*$/, LIMITS['128k']], + [/^glm-4\.5.*$/, 
LIMITS['128k']], + + // ------------------- + // DeepSeek / GPT-OSS / Kimi / Llama & Mistral examples + // ------------------- + [/^deepseek-r1.*$/, LIMITS['128k']], + [/^deepseek-v3(?:\.1)?.*$/, LIMITS['128k']], // NOTE(review): bare 'deepseek-v3' / 'deepseek-v3.1' normalize to 'deepseek' and fall through to DEFAULT_TOKEN_LIMIT instead of matching here + [/^kimi-k2-instruct.*$/, LIMITS['128k']], + [/^gpt-oss.*$/, LIMITS['128k']], + [/^llama-4-scout.*$/, LIMITS['10m'] as unknown as TokenCount], // ultra-long variants - handle carefully + [/^mistral-large-2.*$/, LIMITS['128k']], +]; + +/** Return the token limit for a model string (uses normalize + ordered regex list). */ +export function tokenLimit(model: Model): TokenCount { + const norm = normalize(model); + + for (const [regex, limit] of PATTERNS) { + if (regex.test(norm)) { + return limit; + } + } + + // final fallback: DEFAULT_TOKEN_LIMIT (power-of-two 128K) + return DEFAULT_TOKEN_LIMIT; }