Re-implement tokenLimits class to make it work correctly for Qwen and many other model types. (#542)

The original tokenLimits was copied over from gemini-cli and only works with gemini.
2025-12-19 09:33:53 +00:00 · 2025-09-08 20:38:47 -07:00
parent 621fe2e8ba
commit e63233cefc
2 changed files with 375 additions and 26 deletions
--- a/packages/core/src/core/tokenLimits.test.ts
+++ b/packages/core/src/core/tokenLimits.test.ts
@@ -0,0 +1,227 @@
+import { describe, it, expect } from 'vitest';
+import { normalize, tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js';
+
+describe('normalize', () => {
+  it('should lowercase and trim the model string', () => {
+    expect(normalize('  GEMINI-1.5-PRO  ')).toBe('gemini-1.5-pro');
+  });
+
+  it('should strip provider prefixes', () => {
+    expect(normalize('google/gemini-1.5-pro')).toBe('gemini-1.5-pro');
+    expect(normalize('anthropic/claude-3.5-sonnet')).toBe('claude-3.5-sonnet');
+  });
+
+  it('should handle pipe and colon separators', () => {
+    expect(normalize('qwen|qwen2.5:qwen2.5-1m')).toBe('qwen2.5-1m');
+  });
+
+  it('should collapse whitespace to a single hyphen', () => {
+    expect(normalize('claude 3.5 sonnet')).toBe('claude-3.5-sonnet');
+  });
+
+  it('should remove date and version suffixes', () => {
+    expect(normalize('gemini-1.5-pro-20250219')).toBe('gemini-1.5-pro');
+    expect(normalize('gpt-4o-mini-v1')).toBe('gpt-4o-mini');
+    expect(normalize('claude-3.7-sonnet-20240715')).toBe('claude-3.7-sonnet');
+    expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1');
+    expect(normalize('gemini-2.0-flash-preview-20250520')).toBe(
+      'gemini-2.0-flash',
+    );
+  });
+
+  it('should remove quantization and numeric suffixes', () => {
+    expect(normalize('qwen3-coder-7b-4bit')).toBe('qwen3-coder-7b');
+    expect(normalize('llama-4-scout-int8')).toBe('llama-4-scout');
+    expect(normalize('mistral-large-2-bf16')).toBe('mistral-large-2');
+    expect(normalize('deepseek-v3.1-q4')).toBe('deepseek-v3.1');
+    expect(normalize('qwen2.5-quantized')).toBe('qwen2.5');
+  });
+
+  it('should handle a combination of normalization rules', () => {
+    expect(normalize('  Google/GEMINI-2.5-PRO:gemini-2.5-pro-20250605  ')).toBe(
+      'gemini-2.5-pro',
+    );
+  });
+
+  it('should handle empty or null input', () => {
+    expect(normalize('')).toBe('');
+    expect(normalize(undefined as unknown as string)).toBe('');
+    expect(normalize(null as unknown as string)).toBe('');
+  });
+
+  it('should remove preview suffixes', () => {
+    expect(normalize('gemini-2.0-flash-preview')).toBe('gemini-2.0-flash');
+  });
+
+  it('should remove version numbers with dots when they are at the end', () => {
+    expect(normalize('gpt-4.1.1-latest')).toBe('gpt-4.1.1');
+    expect(normalize('gpt-4.1-latest')).toBe('gpt-4.1');
+  });
+});
+
+describe('tokenLimit', () => {
+  // Test cases for each model family
+  describe('Google Gemini', () => {
+    it('should return the correct limit for Gemini 1.5 Pro', () => {
+      expect(tokenLimit('gemini-1.5-pro')).toBe(2097152);
+    });
+    it('should return the correct limit for Gemini 1.5 Flash', () => {
+      expect(tokenLimit('gemini-1.5-flash')).toBe(1048576);
+    });
+    it('should return the correct limit for Gemini 2.5 Pro', () => {
+      expect(tokenLimit('gemini-2.5-pro')).toBe(1048576);
+    });
+    it('should return the correct limit for Gemini 2.5 Flash', () => {
+      expect(tokenLimit('gemini-2.5-flash')).toBe(1048576);
+    });
+    it('should return the correct limit for Gemini 2.0 Flash with image generation', () => {
+      expect(tokenLimit('gemini-2.0-flash-image-generation')).toBe(32768);
+    });
+    it('should return the correct limit for Gemini 2.0 Flash', () => {
+      expect(tokenLimit('gemini-2.0-flash')).toBe(1048576);
+    });
+  });
+
+  describe('OpenAI', () => {
+    it('should return the correct limit for o3-mini', () => {
+      expect(tokenLimit('o3-mini')).toBe(200000);
+    });
+    it('should return the correct limit for o3 models', () => {
+      expect(tokenLimit('o3')).toBe(200000);
+    });
+    it('should return the correct limit for o4-mini', () => {
+      expect(tokenLimit('o4-mini')).toBe(200000);
+    });
+    it('should return the correct limit for gpt-4o-mini', () => {
+      expect(tokenLimit('gpt-4o-mini')).toBe(131072);
+    });
+    it('should return the correct limit for gpt-4o', () => {
+      expect(tokenLimit('gpt-4o')).toBe(131072);
+    });
+    it('should return the correct limit for gpt-4.1-mini', () => {
+      expect(tokenLimit('gpt-4.1-mini')).toBe(1048576);
+    });
+    it('should return the correct limit for gpt-4.1 models', () => {
+      expect(tokenLimit('gpt-4.1')).toBe(1048576);
+    });
+    it('should return the correct limit for gpt-4', () => {
+      expect(tokenLimit('gpt-4')).toBe(131072);
+    });
+  });
+
+  describe('Anthropic Claude', () => {
+    it('should return the correct limit for Claude 3.5 Sonnet', () => {
+      expect(tokenLimit('claude-3.5-sonnet')).toBe(200000);
+    });
+    it('should return the correct limit for Claude 3.7 Sonnet', () => {
+      expect(tokenLimit('claude-3.7-sonnet')).toBe(1048576);
+    });
+    it('should return the correct limit for Claude Sonnet 4', () => {
+      expect(tokenLimit('claude-sonnet-4')).toBe(1048576);
+    });
+    it('should return the correct limit for Claude Opus 4', () => {
+      expect(tokenLimit('claude-opus-4')).toBe(1048576);
+    });
+  });
+
+  describe('Alibaba Qwen', () => {
+    it('should return the correct limit for qwen3-coder commercial models', () => {
+      expect(tokenLimit('qwen3-coder-plus')).toBe(1048576);
+      expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1048576);
+      expect(tokenLimit('qwen3-coder-flash')).toBe(1048576);
+      expect(tokenLimit('qwen3-coder-flash-20250601')).toBe(1048576);
+    });
+
+    it('should return the correct limit for qwen3-coder open source models', () => {
+      expect(tokenLimit('qwen3-coder-7b')).toBe(262144);
+      expect(tokenLimit('qwen3-coder-480b-a35b-instruct')).toBe(262144);
+      expect(tokenLimit('qwen3-coder-30b-a3b-instruct')).toBe(262144);
+    });
+
+    it('should return the correct limit for qwen3 2507 variants', () => {
+      expect(tokenLimit('qwen3-some-model-2507-instruct')).toBe(262144);
+    });
+
+    it('should return the correct limit for qwen2.5-1m', () => {
+      expect(tokenLimit('qwen2.5-1m')).toBe(1048576);
+      expect(tokenLimit('qwen2.5-1m-instruct')).toBe(1048576);
+    });
+
+    it('should return the correct limit for qwen2.5', () => {
+      expect(tokenLimit('qwen2.5')).toBe(131072);
+      expect(tokenLimit('qwen2.5-instruct')).toBe(131072);
+    });
+
+    it('should return the correct limit for qwen-plus', () => {
+      expect(tokenLimit('qwen-plus-latest')).toBe(1048576);
+      expect(tokenLimit('qwen-plus')).toBe(131072);
+    });
+
+    it('should return the correct limit for qwen-flash', () => {
+      expect(tokenLimit('qwen-flash-latest')).toBe(1048576);
+    });
+
+    it('should return the correct limit for qwen-turbo', () => {
+      expect(tokenLimit('qwen-turbo')).toBe(131072);
+      expect(tokenLimit('qwen-turbo-latest')).toBe(131072);
+    });
+  });
+
+  describe('ByteDance Seed-OSS', () => {
+    it('should return the correct limit for seed-oss', () => {
+      expect(tokenLimit('seed-oss')).toBe(524288);
+    });
+  });
+
+  describe('Zhipu GLM', () => {
+    it('should return the correct limit for glm-4.5v', () => {
+      expect(tokenLimit('glm-4.5v')).toBe(65536);
+    });
+    it('should return the correct limit for glm-4.5-air', () => {
+      expect(tokenLimit('glm-4.5-air')).toBe(131072);
+    });
+    it('should return the correct limit for glm-4.5', () => {
+      expect(tokenLimit('glm-4.5')).toBe(131072);
+    });
+  });
+
+  describe('Other models', () => {
+    it('should return the correct limit for deepseek-r1', () => {
+      expect(tokenLimit('deepseek-r1')).toBe(131072);
+    });
+    it('should return the correct limit for deepseek-v3', () => {
+      expect(tokenLimit('deepseek-v3')).toBe(131072);
+    });
+    it('should return the correct limit for deepseek-v3.1', () => {
+      expect(tokenLimit('deepseek-v3.1')).toBe(131072);
+    });
+    it('should return the correct limit for kimi-k2-instruct', () => {
+      expect(tokenLimit('kimi-k2-instruct')).toBe(131072);
+    });
+    it('should return the correct limit for gpt-oss', () => {
+      expect(tokenLimit('gpt-oss')).toBe(131072);
+    });
+    it('should return the correct limit for llama-4-scout', () => {
+      expect(tokenLimit('llama-4-scout')).toBe(10485760);
+    });
+    it('should return the correct limit for mistral-large-2', () => {
+      expect(tokenLimit('mistral-large-2')).toBe(131072);
+    });
+  });
+
+  // Test for default limit
+  it('should return the default token limit for an unknown model', () => {
+    expect(tokenLimit('unknown-model-v1.0')).toBe(DEFAULT_TOKEN_LIMIT);
+  });
+
+  // Test with complex model string
+  it('should return the correct limit for a complex model string', () => {
+    expect(tokenLimit('  a/b/c|GPT-4o:gpt-4o-2024-05-13-q4  ')).toBe(131072);
+  });
+
+  // Test case-insensitive matching
+  it('should handle case-insensitive model names', () => {
+    expect(tokenLimit('GPT-4O')).toBe(131072);
+    expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000);
+  });
+});
--- a/packages/core/src/core/tokenLimits.ts
+++ b/packages/core/src/core/tokenLimits.ts
@@ -1,32 +1,154 @@
-/**
- * @license
- * Copyright 2025 Google LLC
- * SPDX-License-Identifier: Apache-2.0
- */
-
 type Model = string;
 type TokenCount = number;

-export const DEFAULT_TOKEN_LIMIT = 1_048_576;
+export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)

-export function tokenLimit(model: Model): TokenCount {
-  // Add other models as they become relevant or if specified by config
-  // Pulled from https://ai.google.dev/gemini-api/docs/models
-  switch (model) {
-    case 'gemini-1.5-pro':
-      return 2_097_152;
-    case 'gemini-1.5-flash':
-    case 'gemini-2.5-pro-preview-05-06':
-    case 'gemini-2.5-pro-preview-06-05':
-    case 'gemini-2.5-pro':
-    case 'gemini-2.5-flash-preview-05-20':
-    case 'gemini-2.5-flash':
-    case 'gemini-2.5-flash-lite':
-    case 'gemini-2.0-flash':
-      return 1_048_576;
-    case 'gemini-2.0-flash-preview-image-generation':
-      return 32_000;
-    default:
-      return DEFAULT_TOKEN_LIMIT;
+/**
+ * Accurate numeric limits:
+ * - power-of-two approximations (128K -> 131072, 256K -> 262144, etc.)
+ * - vendor-declared exact values (e.g., 200k -> 200000) are used as stated in docs.
+ */
+const LIMITS = {
+  '32k': 32_768,
+  '64k': 65_536,
+  '128k': 131_072,
+  '200k': 200_000, // vendor-declared decimal (OpenAI / Anthropic use 200k)
+  '256k': 262_144,
+  '512k': 524_288,
+  '1m': 1_048_576,
+  '2m': 2_097_152,
+  '10m': 10_485_760, // 10 million tokens
+} as const;
+
+/** Robust normalizer: strips provider prefixes, pipes/colons, date/version suffixes, etc. */
+export function normalize(model: string): string {
+  let s = (model ?? '').toLowerCase().trim();
+
+  // keep final path segment (strip provider prefixes), handle pipe/colon
+  s = s.replace(/^.*\//, '');
+  s = s.split('|').pop() ?? s;
+  s = s.split(':').pop() ?? s;
+
+  // collapse whitespace to single hyphen
+  s = s.replace(/\s+/g, '-');
+
+  // remove trailing build / date / revision suffixes:
+  // - dates (e.g., -20250219), -v1, version numbers, 'latest', 'preview' etc.
+  s = s.replace(/-preview/g, '');
+  // Special handling for Qwen model names that include "-latest" as part of the model name
+  if (!s.match(/^qwen-(?:plus|flash)-latest$/)) {
+    // \d{6,} - Match 6 or more digits (dates) like -20250219 (6+ digit dates)
+    // \d+x\d+b - Match patterns like 4x8b, -7b, -70b
+    // v\d+(?:\.\d+)* - Match version patterns starting with 'v' like -v1, -v1.2, -v2.1.3
+    // -\d+(?:\.\d+)+ - Match version numbers with dots (that are preceded by a dash),
+    //   like -1.1, -2.0.1 but only when they're suffixes, Example: model-test-1.1 → model-test;
+    //   Note: this does NOT match 4.1 in gpt-4.1 because there's no dash before 4.1 in that context.
+    // latest - Match the literal string "latest"
+    s = s.replace(
+      /-(?:\d{6,}|\d+x\d+b|v\d+(?:\.\d+)*|-\d+(?:\.\d+)+|latest)$/g,
+      '',
+    );
  }
+
+  // remove quantization / numeric / precision suffixes common in local/community models
+  s = s.replace(/-(?:\d?bit|int[48]|bf16|fp16|q[45]|quantized)$/g, '');
+
+  return s;
+}
+
+/** Ordered regex patterns: most specific -> most general (first match wins). */
+const PATTERNS: Array<[RegExp, TokenCount]> = [
+  // -------------------
+  // Google Gemini
+  // -------------------
+  [/^gemini-1\.5-pro$/, LIMITS['2m']],
+  [/^gemini-1\.5-flash$/, LIMITS['1m']],
+  [/^gemini-2\.5-pro.*$/, LIMITS['1m']],
+  [/^gemini-2\.5-flash.*$/, LIMITS['1m']],
+  [/^gemini-2\.0-flash-image-generation$/, LIMITS['32k']],
+  [/^gemini-2\.0-flash.*$/, LIMITS['1m']],
+
+  // -------------------
+  // OpenAI (o3 / o4-mini / gpt-4.1 / gpt-4o family)
+  // o3 and o4-mini document a 200,000-token context window (decimal).
+  // Note: GPT-4.1 models typically report 1_048_576 (1M) context in OpenAI announcements.
+  [/^o3(?:-mini|$).*$/, LIMITS['200k']],
+  [/^o3.*$/, LIMITS['200k']],
+  [/^o4-mini.*$/, LIMITS['200k']],
+  [/^gpt-4\.1-mini.*$/, LIMITS['1m']],
+  [/^gpt-4\.1.*$/, LIMITS['1m']],
+  [/^gpt-4o-mini.*$/, LIMITS['128k']],
+  [/^gpt-4o.*$/, LIMITS['128k']],
+  [/^gpt-4.*$/, LIMITS['128k']],
+
+  // -------------------
+  // Anthropic Claude
+  // - Claude Sonnet / Sonnet 3.5 and related Sonnet variants: 200,000 tokens documented.
+  // - Some Sonnet/Opus models offer 1M in beta/enterprise tiers (handled separately if needed).
+  [/^claude-3\.5-sonnet.*$/, LIMITS['200k']],
+  [/^claude-3\.7-sonnet.*$/, LIMITS['1m']], // some Sonnet 3.7/Opus variants advertise 1M beta in docs
+  [/^claude-sonnet-4.*$/, LIMITS['1m']],
+  [/^claude-opus-4.*$/, LIMITS['1m']],
+
+  // -------------------
+  // Alibaba / Qwen
+  // -------------------
+  // Commercial Qwen3-Coder-Plus: 1M token context
+  [/^qwen3-coder-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-plus" and date variants
+
+  // Commercial Qwen3-Coder-Flash: 1M token context
+  [/^qwen3-coder-flash(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-flash" and date variants
+
+  // Open-source Qwen3-Coder variants: 256K native
+  [/^qwen3-coder-.*$/, LIMITS['256k']],
+  // Open-source Qwen3 2507 variants: 256K native
+  [/^qwen3-.*-2507-.*$/, LIMITS['256k']],
+
+  // Open-source long-context Qwen2.5-1M
+  [/^qwen2\.5-1m.*$/, LIMITS['1m']],
+
+  // Standard Qwen2.5: 128K
+  [/^qwen2\.5.*$/, LIMITS['128k']],
+
+  // Studio commercial Qwen-Plus / Qwen-Flash / Qwen-Turbo
+  [/^qwen-plus-latest$/, LIMITS['1m']], // Commercial latest: 1M
+  [/^qwen-plus.*$/, LIMITS['128k']], // Standard: 128K
+  [/^qwen-flash-latest$/, LIMITS['1m']],
+  [/^qwen-turbo.*$/, LIMITS['128k']],
+
+  // -------------------
+  // ByteDance Seed-OSS (512K)
+  // -------------------
+  [/^seed-oss.*$/, LIMITS['512k']],
+
+  // -------------------
+  // Zhipu GLM
+  // -------------------
+  [/^glm-4\.5v.*$/, LIMITS['64k']],
+  [/^glm-4\.5-air.*$/, LIMITS['128k']],
+  [/^glm-4\.5.*$/, LIMITS['128k']],
+
+  // -------------------
+  // DeepSeek / GPT-OSS / Kimi / Llama & Mistral examples
+  // -------------------
+  [/^deepseek-r1.*$/, LIMITS['128k']],
+  [/^deepseek-v3(?:\.1)?.*$/, LIMITS['128k']],
+  [/^kimi-k2-instruct.*$/, LIMITS['128k']],
+  [/^gpt-oss.*$/, LIMITS['128k']],
+  [/^llama-4-scout.*$/, LIMITS['10m'] as unknown as TokenCount], // ultra-long variants - handle carefully
+  [/^mistral-large-2.*$/, LIMITS['128k']],
+];
+
+/** Return the token limit for a model string (uses normalize + ordered regex list). */
+export function tokenLimit(model: Model): TokenCount {
+  const norm = normalize(model);
+
+  for (const [regex, limit] of PATTERNS) {
+    if (regex.test(norm)) {
+      return limit;
+    }
+  }
+
+  // final fallback: DEFAULT_TOKEN_LIMIT (power-of-two 128K)
+  return DEFAULT_TOKEN_LIMIT;
 }