Re-implement the tokenLimits module so that it works correctly for Qwen and many other model types. (#542)

The original tokenLimits was copied over from gemini-cli and only worked with Gemini models.
This commit is contained in:
zhutao100
2025-09-08 20:38:47 -07:00
committed by GitHub
parent 621fe2e8ba
commit e63233cefc
2 changed files with 375 additions and 26 deletions

View File

@@ -0,0 +1,227 @@
import { describe, it, expect } from 'vitest';
import { normalize, tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js';
// Unit tests for normalize(): each case feeds a raw model identifier and
// checks the canonical string that comes back.
describe('normalize', () => {
// Case and surrounding whitespace must not matter.
it('should lowercase and trim the model string', () => {
expect(normalize(' GEMINI-1.5-PRO ')).toBe('gemini-1.5-pro');
});
// Everything up to and including the last '/' is treated as a provider prefix.
it('should strip provider prefixes', () => {
expect(normalize('google/gemini-1.5-pro')).toBe('gemini-1.5-pro');
expect(normalize('anthropic/claude-3.5-sonnet')).toBe('claude-3.5-sonnet');
});
// Only the segment after the last '|' and the last ':' is kept.
it('should handle pipe and colon separators', () => {
expect(normalize('qwen|qwen2.5:qwen2.5-1m')).toBe('qwen2.5-1m');
});
it('should collapse whitespace to a single hyphen', () => {
expect(normalize('claude 3.5 sonnet')).toBe('claude-3.5-sonnet');
});
// Trailing date stamps, '-v<N>' revisions, '-latest' and '-preview' are removed;
// dotted versions that are part of the model name (e.g. 4.1) are kept.
it('should remove date and version suffixes', () => {
expect(normalize('gemini-1.5-pro-20250219')).toBe('gemini-1.5-pro');
expect(normalize('gpt-4o-mini-v1')).toBe('gpt-4o-mini');
expect(normalize('claude-3.7-sonnet-20240715')).toBe('claude-3.7-sonnet');
expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1');
expect(normalize('gemini-2.0-flash-preview-20250520')).toBe(
'gemini-2.0-flash',
);
});
// Precision / quantization tags are removed; size tags such as '-7b' and
// trailing single numbers such as '-2' are expected to survive.
it('should remove quantization and numeric suffixes', () => {
expect(normalize('qwen3-coder-7b-4bit')).toBe('qwen3-coder-7b');
expect(normalize('llama-4-scout-int8')).toBe('llama-4-scout');
expect(normalize('mistral-large-2-bf16')).toBe('mistral-large-2');
expect(normalize('deepseek-v3.1-q4')).toBe('deepseek-v3.1');
expect(normalize('qwen2.5-quantized')).toBe('qwen2.5');
});
// Trimming, lowercasing, prefix stripping, colon handling and date removal in one input.
it('should handle a combination of normalization rules', () => {
expect(normalize(' Google/GEMINI-2.5-PRO:gemini-2.5-pro-20250605 ')).toBe(
'gemini-2.5-pro',
);
});
// null / undefined are forced through the string-typed signature on purpose:
// the function is expected to return '' rather than throw.
it('should handle empty or null input', () => {
expect(normalize('')).toBe('');
expect(normalize(undefined as unknown as string)).toBe('');
expect(normalize(null as unknown as string)).toBe('');
});
it('should remove preview suffixes', () => {
expect(normalize('gemini-2.0-flash-preview')).toBe('gemini-2.0-flash');
});
// NOTE(review): despite the title, both assertions expect only '-latest' to be
// removed while the dotted version (4.1.1 / 4.1) stays in the result.
it('should remove version numbers with dots when they are at the end', () => {
expect(normalize('gpt-4.1.1-latest')).toBe('gpt-4.1.1');
expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1');
});
});
// Unit tests for tokenLimit(): every case passes a model identifier and checks
// the numeric context-window size returned for it.
// Values used below: 32768 = 32K, 65536 = 64K, 131072 = 128K, 262144 = 256K,
// 524288 = 512K, 1048576 = 1M, 2097152 = 2M, 10485760 = 10M (all multiples of 1024),
// plus the decimal 200000.
describe('tokenLimit', () => {
// One nested suite per model family.
describe('Google Gemini', () => {
it('should return the correct limit for Gemini 1.5 Pro', () => {
expect(tokenLimit('gemini-1.5-pro')).toBe(2097152);
});
it('should return the correct limit for Gemini 1.5 Flash', () => {
expect(tokenLimit('gemini-1.5-flash')).toBe(1048576);
});
it('should return the correct limit for Gemini 2.5 Pro', () => {
expect(tokenLimit('gemini-2.5-pro')).toBe(1048576);
});
it('should return the correct limit for Gemini 2.5 Flash', () => {
expect(tokenLimit('gemini-2.5-flash')).toBe(1048576);
});
// The image-generation variant is expected to have a much smaller window
// than the plain 2.0 Flash model checked right after it.
it('should return the correct limit for Gemini 2.0 Flash with image generation', () => {
expect(tokenLimit('gemini-2.0-flash-image-generation')).toBe(32768);
});
it('should return the correct limit for Gemini 2.0 Flash', () => {
expect(tokenLimit('gemini-2.0-flash')).toBe(1048576);
});
});
describe('OpenAI', () => {
// o3 / o4-mini expect the decimal 200000, not a power of two.
it('should return the correct limit for o3-mini', () => {
expect(tokenLimit('o3-mini')).toBe(200000);
});
it('should return the correct limit for o3 models', () => {
expect(tokenLimit('o3')).toBe(200000);
});
it('should return the correct limit for o4-mini', () => {
expect(tokenLimit('o4-mini')).toBe(200000);
});
it('should return the correct limit for gpt-4o-mini', () => {
expect(tokenLimit('gpt-4o-mini')).toBe(131072);
});
it('should return the correct limit for gpt-4o', () => {
expect(tokenLimit('gpt-4o')).toBe(131072);
});
// gpt-4.1 variants expect 1M, unlike gpt-4o / gpt-4 which expect 128K.
it('should return the correct limit for gpt-4.1-mini', () => {
expect(tokenLimit('gpt-4.1-mini')).toBe(1048576);
});
it('should return the correct limit for gpt-4.1 models', () => {
expect(tokenLimit('gpt-4.1')).toBe(1048576);
});
it('should return the correct limit for gpt-4', () => {
expect(tokenLimit('gpt-4')).toBe(131072);
});
});
describe('Anthropic Claude', () => {
it('should return the correct limit for Claude 3.5 Sonnet', () => {
expect(tokenLimit('claude-3.5-sonnet')).toBe(200000);
});
it('should return the correct limit for Claude 3.7 Sonnet', () => {
expect(tokenLimit('claude-3.7-sonnet')).toBe(1048576);
});
it('should return the correct limit for Claude Sonnet 4', () => {
expect(tokenLimit('claude-sonnet-4')).toBe(1048576);
});
it('should return the correct limit for Claude Opus 4', () => {
expect(tokenLimit('claude-opus-4')).toBe(1048576);
});
});
describe('Alibaba Qwen', () => {
// "plus" / "flash" coder models expect 1M, with or without a date stamp.
it('should return the correct limit for qwen3-coder commercial models', () => {
expect(tokenLimit('qwen3-coder-plus')).toBe(1048576);
expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1048576);
expect(tokenLimit('qwen3-coder-flash')).toBe(1048576);
expect(tokenLimit('qwen3-coder-flash-20250601')).toBe(1048576);
});
// Every other qwen3-coder-* name expects 256K.
it('should return the correct limit for qwen3-coder open source models', () => {
expect(tokenLimit('qwen3-coder-7b')).toBe(262144);
expect(tokenLimit('qwen3-coder-480b-a35b-instruct')).toBe(262144);
expect(tokenLimit('qwen3-coder-30b-a3b-instruct')).toBe(262144);
});
it('should return the correct limit for qwen3 2507 variants', () => {
expect(tokenLimit('qwen3-some-model-2507-instruct')).toBe(262144);
});
it('should return the correct limit for qwen2.5-1m', () => {
expect(tokenLimit('qwen2.5-1m')).toBe(1048576);
expect(tokenLimit('qwen2.5-1m-instruct')).toBe(1048576);
});
it('should return the correct limit for qwen2.5', () => {
expect(tokenLimit('qwen2.5')).toBe(131072);
expect(tokenLimit('qwen2.5-instruct')).toBe(131072);
});
// For qwen-plus the "-latest" suffix is significant: it selects 1M,
// while the plain name expects 128K.
it('should return the correct limit for qwen-plus', () => {
expect(tokenLimit('qwen-plus-latest')).toBe(1048576);
expect(tokenLimit('qwen-plus')).toBe(131072);
});
it('should return the correct limit for qwen-flash', () => {
expect(tokenLimit('qwen-flash-latest')).toBe(1048576);
});
// For qwen-turbo, "-latest" makes no difference to the expected limit.
it('should return the correct limit for qwen-turbo', () => {
expect(tokenLimit('qwen-turbo')).toBe(131072);
expect(tokenLimit('qwen-turbo-latest')).toBe(131072);
});
});
describe('ByteDance Seed-OSS', () => {
it('should return the correct limit for seed-oss', () => {
expect(tokenLimit('seed-oss')).toBe(524288);
});
});
describe('Zhipu GLM', () => {
// glm-4.5v expects 64K, the other glm-4.5 names expect 128K.
it('should return the correct limit for glm-4.5v', () => {
expect(tokenLimit('glm-4.5v')).toBe(65536);
});
it('should return the correct limit for glm-4.5-air', () => {
expect(tokenLimit('glm-4.5-air')).toBe(131072);
});
it('should return the correct limit for glm-4.5', () => {
expect(tokenLimit('glm-4.5')).toBe(131072);
});
});
describe('Other models', () => {
it('should return the correct limit for deepseek-r1', () => {
expect(tokenLimit('deepseek-r1')).toBe(131072);
});
it('should return the correct limit for deepseek-v3', () => {
expect(tokenLimit('deepseek-v3')).toBe(131072);
});
it('should return the correct limit for deepseek-v3.1', () => {
expect(tokenLimit('deepseek-v3.1')).toBe(131072);
});
it('should return the correct limit for kimi-k2-instruct', () => {
expect(tokenLimit('kimi-k2-instruct')).toBe(131072);
});
it('should return the correct limit for gpt-oss', () => {
expect(tokenLimit('gpt-oss')).toBe(131072);
});
// Largest value in the suite: 10 * 1024 * 1024.
it('should return the correct limit for llama-4-scout', () => {
expect(tokenLimit('llama-4-scout')).toBe(10485760);
});
it('should return the correct limit for mistral-large-2', () => {
expect(tokenLimit('mistral-large-2')).toBe(131072);
});
});
// A name that belongs to no known family must yield the exported default.
it('should return the default token limit for an unknown model', () => {
expect(tokenLimit('unknown-model-v1.0')).toBe(DEFAULT_TOKEN_LIMIT);
});
// End-to-end check: padding, mixed case, a multi-segment provider path, pipe and
// colon separators and a quantization suffix on a single input.
it('should return the correct limit for a complex model string', () => {
expect(tokenLimit(' a/b/c|GPT-4o:gpt-4o-2024-05-13-q4 ')).toBe(131072);
});
// Upper-case identifiers must resolve like their lower-case forms.
it('should handle case-insensitive model names', () => {
expect(tokenLimit('GPT-4O')).toBe(131072);
expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000);
});
});

View File

@@ -1,32 +1,154 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
type Model = string;
type TokenCount = number;
export const DEFAULT_TOKEN_LIMIT = 1_048_576;
export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)
export function tokenLimit(model: Model): TokenCount {
// Add other models as they become relevant or if specified by config
// Pulled from https://ai.google.dev/gemini-api/docs/models
switch (model) {
case 'gemini-1.5-pro':
return 2_097_152;
case 'gemini-1.5-flash':
case 'gemini-2.5-pro-preview-05-06':
case 'gemini-2.5-pro-preview-06-05':
case 'gemini-2.5-pro':
case 'gemini-2.5-flash-preview-05-20':
case 'gemini-2.5-flash':
case 'gemini-2.5-flash-lite':
case 'gemini-2.0-flash':
return 1_048_576;
case 'gemini-2.0-flash-preview-image-generation':
return 32_000;
default:
return DEFAULT_TOKEN_LIMIT;
/**
 * Named context-window sizes shared by the pattern table.
 *
 * Binary sizes ("128k", "1m", ...) are written as multiples of 1024 (2 ** 10)
 * or 1024 * 1024 (2 ** 20); "200k" is the one decimal value, kept exactly as
 * the number 200,000.
 */
const LIMITS = {
  '32k': 2 ** 15, // 32,768
  '64k': 2 ** 16, // 65,536
  '128k': 2 ** 17, // 131,072
  '200k': 200_000, // decimal on purpose, not a power of two
  '256k': 2 ** 18, // 262,144
  '512k': 2 ** 19, // 524,288
  '1m': 2 ** 20, // 1,048,576
  '2m': 2 ** 21, // 2,097,152
  '10m': 10 * 2 ** 20, // 10,485,760
} as const;
/**
 * Canonicalizes a raw model identifier so it can be matched against the
 * model-family patterns.
 *
 * Steps, applied in this order:
 *  1. `null` / `undefined` become `''`; the string is lowercased and trimmed.
 *  2. Only the text after the last `/`, then after the last `|`, then after the
 *     last `:` is kept (provider prefixes and alias separators).
 *  3. Each run of whitespace becomes a single `-`.
 *  4. Every `-preview` occurrence is removed.
 *  5. One trailing build suffix (date stamp, `-v<version>`, `-latest`, ...) is
 *     removed, except for the names `qwen-plus-latest` and `qwen-flash-latest`,
 *     where `-latest` is part of the model name.
 *  6. One trailing quantization / precision suffix is removed and, if one was
 *     removed, a date stamp that it was hiding is removed as well.
 *
 * @param model Raw model string, e.g. `' Google/GEMINI-2.5-PRO:gemini-2.5-pro-20250605 '`.
 * @returns The normalized identifier, e.g. `'gemini-2.5-pro'`; `''` for empty or nullish input.
 */
export function normalize(model: string): string {
  let s = (model ?? '').toLowerCase().trim();

  // Keep the final path segment (strip provider prefixes), then the part after
  // the last pipe and the last colon.
  s = s.replace(/^.*\//, '');
  s = s.split('|').pop() ?? s;
  s = s.split(':').pop() ?? s;

  // Collapse whitespace runs to a single hyphen.
  s = s.replace(/\s+/g, '-');

  // "-preview" is dropped wherever it occurs, not only at the end
  // (e.g. gemini-2.0-flash-preview-20250520).
  s = s.replace(/-preview/g, '');

  // Qwen ships "qwen-plus-latest" / "qwen-flash-latest" as distinct model names,
  // so their "-latest" must survive.
  if (!/^qwen-(?:plus|flash)-latest$/.test(s)) {
    // Remove ONE trailing suffix. After the leading hyphen the alternatives are:
    //   \d{6,}           6+ digits, i.e. a compact date such as -20250219
    //   \d+x\d+b         an "NxMb" tag such as -8x7b (a plain size like -7b is NOT matched)
    //   v\d+(?:\.\d+)*   a v-prefixed version such as -v1, -v1.2, -v2.1.3
    //   -\d+(?:\.\d+)+   a dotted version after a SECOND hyphen, such as --1.1;
    //                    a single-hyphen version like the 4.1 in gpt-4.1 is not
    //                    matched, so it stays part of the name
    //   latest           the literal word
    s = s.replace(
      /-(?:\d{6,}|\d+x\d+b|v\d+(?:\.\d+)*|-\d+(?:\.\d+)+|latest)$/,
      '',
    );
  }

  // Remove a quantization / precision suffix common in local/community builds.
  const beforeQuantStrip = s;
  s = s.replace(/-(?:\d?bit|int[48]|bf16|fp16|q[45]|quantized)$/, '');

  // A date stamp placed before the quantization tag (e.g. "-20250219-q4") was
  // not at the end of the string during the suffix pass above; strip it now so
  // such names normalize like their unquantized form. Only dates are removed
  // here: "-v<N>" is left alone because it can be part of the model name once a
  // quantization tag has been peeled off (e.g. deepseek-v3.1-q4 -> deepseek-v3.1).
  if (s !== beforeQuantStrip) {
    s = s.replace(/-\d{6,}$/, '');
  }

  return s;
}
/**
 * Ordered lookup table from normalized model name to context-window size.
 *
 * Entries are tried top to bottom and the first regex that matches wins, so a
 * more specific pattern must appear before the broader pattern of the same
 * family (e.g. the image-generation Gemini entry before `gemini-2.0-flash.*`).
 * All regexes are anchored and carry no flags, so `test()` is stateless.
 */
const PATTERNS: Array<[RegExp, TokenCount]> = [
  // -------------------
  // Google Gemini
  // -------------------
  [/^gemini-1\.5-pro$/, LIMITS['2m']],
  [/^gemini-1\.5-flash$/, LIMITS['1m']],
  [/^gemini-2\.5-pro.*$/, LIMITS['1m']],
  [/^gemini-2\.5-flash.*$/, LIMITS['1m']],
  // Must precede the general 2.0 Flash entry, which would otherwise claim it.
  [/^gemini-2\.0-flash-image-generation$/, LIMITS['32k']],
  [/^gemini-2\.0-flash.*$/, LIMITS['1m']],

  // -------------------
  // OpenAI (o3 / o4-mini / gpt-4.1 / gpt-4o family)
  // o3 and o4-mini use the decimal 200,000 value; gpt-4.1 models use 1M.
  // -------------------
  [/^o3(?:-mini|$).*$/, LIMITS['200k']],
  [/^o3.*$/, LIMITS['200k']],
  [/^o4-mini.*$/, LIMITS['200k']],
  [/^gpt-4\.1-mini.*$/, LIMITS['1m']],
  [/^gpt-4\.1.*$/, LIMITS['1m']],
  [/^gpt-4o-mini.*$/, LIMITS['128k']],
  [/^gpt-4o.*$/, LIMITS['128k']],
  // Catch-all for the remaining gpt-4* names; keep it after the 4.1 / 4o entries.
  [/^gpt-4.*$/, LIMITS['128k']],

  // -------------------
  // Anthropic Claude
  // 3.5 Sonnet is mapped to 200,000; 3.7 Sonnet, Sonnet 4 and Opus 4 are mapped to 1M.
  // NOTE(review): the 1M figures are presumably a beta/enterprise tier - confirm against vendor docs.
  // -------------------
  [/^claude-3\.5-sonnet.*$/, LIMITS['200k']],
  [/^claude-3\.7-sonnet.*$/, LIMITS['1m']],
  [/^claude-sonnet-4.*$/, LIMITS['1m']],
  [/^claude-opus-4.*$/, LIMITS['1m']],

  // -------------------
  // Alibaba / Qwen
  // -------------------
  // Commercial Qwen3-Coder-Plus / -Flash: 1M (bare name or with any "-suffix").
  [/^qwen3-coder-plus(-.*)?$/, LIMITS['1m']],
  [/^qwen3-coder-flash(-.*)?$/, LIMITS['1m']],
  // Every other Qwen3-Coder variant: 256K.
  [/^qwen3-coder-.*$/, LIMITS['256k']],
  // Qwen3 "2507" variants: 256K.
  [/^qwen3-.*-2507-.*$/, LIMITS['256k']],
  // Long-context Qwen2.5-1M must precede the general Qwen2.5 entry.
  [/^qwen2\.5-1m.*$/, LIMITS['1m']],
  [/^qwen2\.5.*$/, LIMITS['128k']],
  // Qwen-Plus / Qwen-Flash / Qwen-Turbo: the "-latest" names of Plus and Flash
  // get 1M; other Plus names and all Turbo names get 128K.
  [/^qwen-plus-latest$/, LIMITS['1m']],
  [/^qwen-plus.*$/, LIMITS['128k']],
  [/^qwen-flash-latest$/, LIMITS['1m']],
  [/^qwen-turbo.*$/, LIMITS['128k']],

  // -------------------
  // ByteDance Seed-OSS (512K)
  // -------------------
  [/^seed-oss.*$/, LIMITS['512k']],

  // -------------------
  // Zhipu GLM
  // -------------------
  [/^glm-4\.5v.*$/, LIMITS['64k']],
  [/^glm-4\.5-air.*$/, LIMITS['128k']],
  [/^glm-4\.5.*$/, LIMITS['128k']],

  // -------------------
  // DeepSeek / GPT-OSS / Kimi / Llama & Mistral examples
  // -------------------
  [/^deepseek-r1.*$/, LIMITS['128k']],
  // normalize() removes a trailing "-v<version>", so "deepseek-v3" and
  // "deepseek-v3.1" reach this table as plain "deepseek". The bare family name
  // is accepted here as well so these models resolve through an explicit entry
  // instead of depending on the fallback default.
  [/^deepseek(?:-v3(?:\.1)?.*)?$/, LIMITS['128k']],
  [/^kimi-k2-instruct.*$/, LIMITS['128k']],
  [/^gpt-oss.*$/, LIMITS['128k']],
  // Ultra-long context: 10 * 1024 * 1024 tokens.
  [/^llama-4-scout.*$/, LIMITS['10m']],
  [/^mistral-large-2.*$/, LIMITS['128k']],
];
/**
 * Looks up the context-window size for a model identifier.
 *
 * The identifier is first canonicalized with `normalize`, then compared with
 * the entries of `PATTERNS` in order; the limit of the first matching entry is
 * returned.
 *
 * @param model Raw model string in any casing, with optional provider prefix or suffixes.
 * @returns The matching limit, or `DEFAULT_TOKEN_LIMIT` when no pattern matches.
 */
export function tokenLimit(model: Model): TokenCount {
  const canonicalName = normalize(model);
  const firstHit = PATTERNS.find(([pattern]) => pattern.test(canonicalName));
  // No family recognized: fall back to the module-wide default.
  return firstHit === undefined ? DEFAULT_TOKEN_LIMIT : firstHit[1];
}