Mirror of https://github.com/QwenLM/qwen-code.git (synced 2025-12-20 16:57:46 +00:00)
Vision model support for Qwen-OAuth (#525)
* refactor: openaiContentGenerator
* refactor: optimize stream handling
* refactor: re-organize refactored files
* fix: unit test cases
* feat: `/model` command for switching to vision model
* fix: lint error
* feat: add image tokenizer to fit vlm context window
* fix: lint and type errors
* feat: add `visionModelPreview` to control default visibility of vision models
* fix: remove deprecated files
* fix: align supported image formats with bailian doc
packages/core/src/utils/request-tokenizer/imageTokenizer.test.ts (new file, 157 lines)
@@ -0,0 +1,157 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, expect } from 'vitest';
import { ImageTokenizer } from './imageTokenizer.js';

describe('ImageTokenizer', () => {
  const tokenizer = new ImageTokenizer();

  describe('token calculation', () => {
    it('should calculate tokens based on image dimensions with reference logic', () => {
      const metadata = {
        width: 28,
        height: 28,
        mimeType: 'image/png',
        dataSize: 1000,
      };

      const tokens = tokenizer.calculateTokens(metadata);

      // 28x28 = 784 pixels = 1 image token + 2 special tokens = 3 total
      // But minimum scaling may apply for small images
      expect(tokens).toBeGreaterThanOrEqual(6); // Minimum after scaling + special tokens
    });

    it('should calculate tokens for larger images', () => {
      const metadata = {
        width: 512,
        height: 512,
        mimeType: 'image/png',
        dataSize: 10000,
      };

      const tokens = tokenizer.calculateTokens(metadata);

      // 512x512 with reference logic: rounded dimensions + scaling + special tokens
      expect(tokens).toBeGreaterThan(300);
      expect(tokens).toBeLessThan(400); // Should be reasonable for 512x512
    });

    it('should enforce minimum tokens per image with scaling', () => {
      const metadata = {
        width: 1,
        height: 1,
        mimeType: 'image/png',
        dataSize: 100,
      };

      const tokens = tokenizer.calculateTokens(metadata);

      // Tiny images get scaled up to minimum pixels + special tokens
      expect(tokens).toBeGreaterThanOrEqual(6); // 4 image tokens + 2 special tokens
    });

    it('should handle very large images with scaling', () => {
      const metadata = {
        width: 8192,
        height: 8192,
        mimeType: 'image/png',
        dataSize: 100000,
      };

      const tokens = tokenizer.calculateTokens(metadata);

      // Very large images should be scaled down to max limit + special tokens
      expect(tokens).toBeLessThanOrEqual(16386); // 16384 max + 2 special tokens
      expect(tokens).toBeGreaterThan(16000); // Should be close to the limit
    });
  });

  describe('PNG dimension extraction', () => {
    it('should extract dimensions from valid PNG', async () => {
      // 1x1 PNG image in base64
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const metadata = await tokenizer.extractImageMetadata(
        pngBase64,
        'image/png',
      );

      expect(metadata.width).toBe(1);
      expect(metadata.height).toBe(1);
      expect(metadata.mimeType).toBe('image/png');
    });

    it('should handle invalid PNG gracefully', async () => {
      const invalidBase64 = 'invalid-png-data';

      const metadata = await tokenizer.extractImageMetadata(
        invalidBase64,
        'image/png',
      );

      // Should return default dimensions
      expect(metadata.width).toBe(512);
      expect(metadata.height).toBe(512);
      expect(metadata.mimeType).toBe('image/png');
    });
  });

  describe('batch processing', () => {
    it('should process multiple images serially', async () => {
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const images = [
        { data: pngBase64, mimeType: 'image/png' },
        { data: pngBase64, mimeType: 'image/png' },
        { data: pngBase64, mimeType: 'image/png' },
      ];

      const tokens = await tokenizer.calculateTokensBatch(images);

      expect(tokens).toHaveLength(3);
      expect(tokens.every((t) => t >= 4)).toBe(true); // All should have at least 4 tokens
    });

    it('should handle mixed valid and invalid images', async () => {
      const validPng =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';
      const invalidPng = 'invalid-data';

      const images = [
        { data: validPng, mimeType: 'image/png' },
        { data: invalidPng, mimeType: 'image/png' },
      ];

      const tokens = await tokenizer.calculateTokensBatch(images);

      expect(tokens).toHaveLength(2);
      expect(tokens.every((t) => t >= 4)).toBe(true); // All should have at least minimum tokens
    });
  });

  describe('different image formats', () => {
    it('should handle different MIME types', async () => {
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const formats = ['image/png', 'image/jpeg', 'image/webp', 'image/gif'];

      for (const mimeType of formats) {
        const metadata = await tokenizer.extractImageMetadata(
          pngBase64,
          mimeType,
        );
        expect(metadata.mimeType).toBe(mimeType);
        expect(metadata.width).toBeGreaterThan(0);
        expect(metadata.height).toBeGreaterThan(0);
      }
    });
  });
});
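
For reference, the 512x512 assertion above follows directly from the 28-pixel rule implemented in imageTokenizer.ts below; a quick sketch of the arithmetic (editorial illustration, not part of the commit):

const PIXELS_PER_TOKEN = 28 * 28; // 784
const wBar = Math.round(512 / 28) * 28; // 504
const hBar = Math.round(512 / 28) * 28; // 504
const imageTokens = Math.floor((wBar * hBar) / PIXELS_PER_TOKEN); // 324
const totalTokens = imageTokens + 2; // 326, inside the asserted 300-400 range
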
packages/core/src/utils/request-tokenizer/imageTokenizer.ts (new file, 505 lines)
@@ -0,0 +1,505 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import type { ImageMetadata } from './types.js';
import { isSupportedImageMimeType } from './supportedImageFormats.js';

/**
 * Image tokenizer for calculating image tokens based on dimensions
 *
 * Key rules:
 * - 28x28 pixels = 1 token
 * - Minimum: 4 tokens per image
 * - Maximum: 16384 tokens per image
 * - Additional: 2 special tokens (vision_bos + vision_eos)
 * - Supports: PNG, JPEG, WebP, GIF, BMP, TIFF, HEIC formats
 */
export class ImageTokenizer {
  /** 28x28 pixels = 1 token */
  private static readonly PIXELS_PER_TOKEN = 28 * 28;

  /** Minimum tokens per image */
  private static readonly MIN_TOKENS_PER_IMAGE = 4;

  /** Maximum tokens per image */
  private static readonly MAX_TOKENS_PER_IMAGE = 16384;

  /** Special tokens for vision markers */
  private static readonly VISION_SPECIAL_TOKENS = 2;

  /**
   * Extract image metadata from base64 data
   *
   * @param base64Data Base64-encoded image data (with or without data URL prefix)
   * @param mimeType MIME type of the image
   * @returns Promise resolving to ImageMetadata with dimensions and format info
   */
  async extractImageMetadata(
    base64Data: string,
    mimeType: string,
  ): Promise<ImageMetadata> {
    try {
      // Check if the MIME type is supported
      if (!isSupportedImageMimeType(mimeType)) {
        console.warn(`Unsupported image format: ${mimeType}`);
        // Return default metadata for unsupported formats
        return {
          width: 512,
          height: 512,
          mimeType,
          dataSize: Math.floor(base64Data.length * 0.75),
        };
      }

      const cleanBase64 = base64Data.replace(/^data:[^;]+;base64,/, '');
      const buffer = Buffer.from(cleanBase64, 'base64');
      const dimensions = await this.extractDimensions(buffer, mimeType);

      return {
        width: dimensions.width,
        height: dimensions.height,
        mimeType,
        dataSize: buffer.length,
      };
    } catch (error) {
      console.warn('Failed to extract image metadata:', error);
      // Return default metadata for fallback
      return {
        width: 512,
        height: 512,
        mimeType,
        dataSize: Math.floor(base64Data.length * 0.75),
      };
    }
  }

  /**
   * Extract image dimensions from buffer based on format
   *
   * @param buffer Binary image data buffer
   * @param mimeType MIME type to determine parsing strategy
   * @returns Promise resolving to width and height dimensions
   */
  private async extractDimensions(
    buffer: Buffer,
    mimeType: string,
  ): Promise<{ width: number; height: number }> {
    if (mimeType.includes('png')) {
      return this.extractPngDimensions(buffer);
    }

    if (mimeType.includes('jpeg') || mimeType.includes('jpg')) {
      return this.extractJpegDimensions(buffer);
    }

    if (mimeType.includes('webp')) {
      return this.extractWebpDimensions(buffer);
    }

    if (mimeType.includes('gif')) {
      return this.extractGifDimensions(buffer);
    }

    if (mimeType.includes('bmp')) {
      return this.extractBmpDimensions(buffer);
    }

    if (mimeType.includes('tiff')) {
      return this.extractTiffDimensions(buffer);
    }

    if (mimeType.includes('heic')) {
      return this.extractHeicDimensions(buffer);
    }

    return { width: 512, height: 512 };
  }

  /**
   * Extract PNG dimensions from IHDR chunk
   * PNG signature: 89 50 4E 47 0D 0A 1A 0A
   * Width/height at bytes 16-19 and 20-23 (big-endian)
   */
  private extractPngDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 24) {
      throw new Error('Invalid PNG: buffer too short');
    }

    // Verify PNG signature
    const signature = buffer.subarray(0, 8);
    const expectedSignature = Buffer.from([
      0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    ]);
    if (!signature.equals(expectedSignature)) {
      throw new Error('Invalid PNG signature');
    }

    const width = buffer.readUInt32BE(16);
    const height = buffer.readUInt32BE(20);

    return { width, height };
  }

  /**
   * Extract JPEG dimensions from SOF (Start of Frame) markers
   * JPEG starts with FF D8, SOF markers: 0xC0-0xC3, 0xC5-0xC7, 0xC9-0xCB, 0xCD-0xCF
   * Dimensions at offset +5 (height) and +7 (width) from SOF marker
   */
  private extractJpegDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 4 || buffer[0] !== 0xff || buffer[1] !== 0xd8) {
      throw new Error('Invalid JPEG signature');
    }

    let offset = 2;

    while (offset < buffer.length - 8) {
      if (buffer[offset] !== 0xff) {
        offset++;
        continue;
      }

      const marker = buffer[offset + 1];

      // SOF markers
      if (
        (marker >= 0xc0 && marker <= 0xc3) ||
        (marker >= 0xc5 && marker <= 0xc7) ||
        (marker >= 0xc9 && marker <= 0xcb) ||
        (marker >= 0xcd && marker <= 0xcf)
      ) {
        const height = buffer.readUInt16BE(offset + 5);
        const width = buffer.readUInt16BE(offset + 7);
        return { width, height };
      }

      const segmentLength = buffer.readUInt16BE(offset + 2);
      offset += 2 + segmentLength;
    }

    throw new Error('Could not find JPEG dimensions');
  }

  /**
   * Extract WebP dimensions from RIFF container
   * Supports VP8, VP8L, and VP8X formats
   */
  private extractWebpDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 30) {
      throw new Error('Invalid WebP: too short');
    }

    const riffSignature = buffer.subarray(0, 4).toString('ascii');
    const webpSignature = buffer.subarray(8, 12).toString('ascii');

    if (riffSignature !== 'RIFF' || webpSignature !== 'WEBP') {
      throw new Error('Invalid WebP signature');
    }

    const format = buffer.subarray(12, 16).toString('ascii');

    if (format === 'VP8 ') {
      const width = buffer.readUInt16LE(26) & 0x3fff;
      const height = buffer.readUInt16LE(28) & 0x3fff;
      return { width, height };
    } else if (format === 'VP8L') {
      const bits = buffer.readUInt32LE(21);
      const width = (bits & 0x3fff) + 1;
      const height = ((bits >> 14) & 0x3fff) + 1;
      return { width, height };
    } else if (format === 'VP8X') {
      const width = (buffer.readUInt32LE(24) & 0xffffff) + 1;
      const height = (buffer.readUInt32LE(26) & 0xffffff) + 1;
      return { width, height };
    }

    throw new Error('Unsupported WebP format');
  }

  /**
   * Extract GIF dimensions from header
   * Supports GIF87a and GIF89a formats
   */
  private extractGifDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 10) {
      throw new Error('Invalid GIF: too short');
    }

    const signature = buffer.subarray(0, 6).toString('ascii');
    if (signature !== 'GIF87a' && signature !== 'GIF89a') {
      throw new Error('Invalid GIF signature');
    }

    const width = buffer.readUInt16LE(6);
    const height = buffer.readUInt16LE(8);

    return { width, height };
  }

  /**
   * Calculate tokens for an image based on its metadata
   *
   * @param metadata Image metadata containing width, height, and format info
   * @returns Total token count including base image tokens and special tokens
   */
  calculateTokens(metadata: ImageMetadata): number {
    return this.calculateTokensWithScaling(metadata.width, metadata.height);
  }

  /**
   * Calculate tokens with scaling logic
   *
   * Steps:
   * 1. Normalize to 28-pixel multiples
   * 2. Scale large images down, small images up
   * 3. Calculate tokens: pixels / 784 + 2 special tokens
   *
   * @param width Original image width in pixels
   * @param height Original image height in pixels
   * @returns Total token count for the image
   */
  private calculateTokensWithScaling(width: number, height: number): number {
    // Normalize to 28-pixel multiples
    let hBar = Math.round(height / 28) * 28;
    let wBar = Math.round(width / 28) * 28;

    // Define pixel boundaries
    const minPixels =
      ImageTokenizer.MIN_TOKENS_PER_IMAGE * ImageTokenizer.PIXELS_PER_TOKEN;
    const maxPixels =
      ImageTokenizer.MAX_TOKENS_PER_IMAGE * ImageTokenizer.PIXELS_PER_TOKEN;

    // Apply scaling
    if (hBar * wBar > maxPixels) {
      // Scale down large images
      const beta = Math.sqrt((height * width) / maxPixels);
      hBar = Math.floor(height / beta / 28) * 28;
      wBar = Math.floor(width / beta / 28) * 28;
    } else if (hBar * wBar < minPixels) {
      // Scale up small images
      const beta = Math.sqrt(minPixels / (height * width));
      hBar = Math.ceil((height * beta) / 28) * 28;
      wBar = Math.ceil((width * beta) / 28) * 28;
    }

    // Calculate tokens
    const imageTokens = Math.floor(
      (hBar * wBar) / ImageTokenizer.PIXELS_PER_TOKEN,
    );

    return imageTokens + ImageTokenizer.VISION_SPECIAL_TOKENS;
  }

  /**
   * Calculate tokens for multiple images serially
   *
   * @param base64DataArray Array of image data with MIME type information
   * @returns Promise resolving to array of token counts in same order as input
   */
  async calculateTokensBatch(
    base64DataArray: Array<{ data: string; mimeType: string }>,
  ): Promise<number[]> {
    const results: number[] = [];

    for (const { data, mimeType } of base64DataArray) {
      try {
        const metadata = await this.extractImageMetadata(data, mimeType);
        results.push(this.calculateTokens(metadata));
      } catch (error) {
        console.warn('Error calculating tokens for image:', error);
        // Return minimum tokens as fallback
        results.push(
          ImageTokenizer.MIN_TOKENS_PER_IMAGE +
            ImageTokenizer.VISION_SPECIAL_TOKENS,
        );
      }
    }

    return results;
  }

  /**
   * Extract BMP dimensions from header
   * BMP signature: 42 4D (BM)
   * Width/height at bytes 18-21 and 22-25 (little-endian)
   */
  private extractBmpDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 26) {
      throw new Error('Invalid BMP: buffer too short');
    }

    // Verify BMP signature
    if (buffer[0] !== 0x42 || buffer[1] !== 0x4d) {
      throw new Error('Invalid BMP signature');
    }

    const width = buffer.readUInt32LE(18);
    const height = buffer.readUInt32LE(22);

    return { width, height: Math.abs(height) }; // Height can be negative for top-down BMPs
  }

  /**
   * Extract TIFF dimensions from IFD (Image File Directory)
   * TIFF can be little-endian (II) or big-endian (MM)
   * Width/height are stored in IFD entries with tags 0x0100 and 0x0101
   */
  private extractTiffDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 8) {
      throw new Error('Invalid TIFF: buffer too short');
    }

    // Check byte order
    const byteOrder = buffer.subarray(0, 2).toString('ascii');
    const isLittleEndian = byteOrder === 'II';
    const isBigEndian = byteOrder === 'MM';

    if (!isLittleEndian && !isBigEndian) {
      throw new Error('Invalid TIFF byte order');
    }

    // Read magic number (should be 42)
    const magic = isLittleEndian
      ? buffer.readUInt16LE(2)
      : buffer.readUInt16BE(2);
    if (magic !== 42) {
      throw new Error('Invalid TIFF magic number');
    }

    // Read IFD offset
    const ifdOffset = isLittleEndian
      ? buffer.readUInt32LE(4)
      : buffer.readUInt32BE(4);

    if (ifdOffset >= buffer.length) {
      throw new Error('Invalid TIFF IFD offset');
    }

    // Read number of directory entries
    const numEntries = isLittleEndian
      ? buffer.readUInt16LE(ifdOffset)
      : buffer.readUInt16BE(ifdOffset);

    let width = 0;
    let height = 0;

    // Parse IFD entries
    for (let i = 0; i < numEntries; i++) {
      const entryOffset = ifdOffset + 2 + i * 12;

      if (entryOffset + 12 > buffer.length) break;

      const tag = isLittleEndian
        ? buffer.readUInt16LE(entryOffset)
        : buffer.readUInt16BE(entryOffset);

      const type = isLittleEndian
        ? buffer.readUInt16LE(entryOffset + 2)
        : buffer.readUInt16BE(entryOffset + 2);

      const value = isLittleEndian
        ? buffer.readUInt32LE(entryOffset + 8)
        : buffer.readUInt32BE(entryOffset + 8);

      if (tag === 0x0100) {
        // ImageWidth
        width = type === 3 ? value : value; // SHORT or LONG
      } else if (tag === 0x0101) {
        // ImageLength (height)
        height = type === 3 ? value : value; // SHORT or LONG
      }

      if (width > 0 && height > 0) break;
    }

    if (width === 0 || height === 0) {
      throw new Error('Could not find TIFF dimensions');
    }

    return { width, height };
  }

  /**
   * Extract HEIC dimensions from meta box
   * HEIC is based on ISO Base Media File Format
   * This is a simplified implementation that looks for 'ispe' (Image Spatial Extents) box
   */
  private extractHeicDimensions(buffer: Buffer): {
    width: number;
    height: number;
  } {
    if (buffer.length < 12) {
      throw new Error('Invalid HEIC: buffer too short');
    }

    // Check for ftyp box with HEIC brand
    const ftypBox = buffer.subarray(4, 8).toString('ascii');
    if (ftypBox !== 'ftyp') {
      throw new Error('Invalid HEIC: missing ftyp box');
    }

    const brand = buffer.subarray(8, 12).toString('ascii');
    if (!['heic', 'heix', 'hevc', 'hevx'].includes(brand)) {
      throw new Error('Invalid HEIC brand');
    }

    // Look for meta box and then ispe box
    let offset = 0;
    while (offset < buffer.length - 8) {
      const boxSize = buffer.readUInt32BE(offset);
      const boxType = buffer.subarray(offset + 4, offset + 8).toString('ascii');

      if (boxType === 'meta') {
        // Look for ispe box inside meta box
        const metaOffset = offset + 8;
        let innerOffset = metaOffset + 4; // Skip version and flags

        while (innerOffset < offset + boxSize - 8) {
          const innerBoxSize = buffer.readUInt32BE(innerOffset);
          const innerBoxType = buffer
            .subarray(innerOffset + 4, innerOffset + 8)
            .toString('ascii');

          if (innerBoxType === 'ispe') {
            // Found Image Spatial Extents box
            if (innerOffset + 20 <= buffer.length) {
              const width = buffer.readUInt32BE(innerOffset + 12);
              const height = buffer.readUInt32BE(innerOffset + 16);
              return { width, height };
            }
          }

          if (innerBoxSize === 0) break;
          innerOffset += innerBoxSize;
        }
      }

      if (boxSize === 0) break;
      offset += boxSize;
    }

    // Fallback: return default dimensions if we can't parse the structure
    console.warn('Could not extract HEIC dimensions, using default');
    return { width: 512, height: 512 };
  }
}
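
A minimal usage sketch of the class above (illustrative only; pngBase64 is assumed to be a base64-encoded PNG string supplied by the caller):

import { ImageTokenizer } from './imageTokenizer.js';

const imageTokenizer = new ImageTokenizer();
// extractImageMetadata parses the image header bytes to get width/height,
// falling back to 512x512 when the data cannot be parsed.
const metadata = await imageTokenizer.extractImageMetadata(pngBase64, 'image/png');
// calculateTokens applies the 28x28-pixel rule plus the two vision special tokens.
const tokens = imageTokenizer.calculateTokens(metadata);
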
packages/core/src/utils/request-tokenizer/index.ts (new file, 40 lines)
@@ -0,0 +1,40 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

export { DefaultRequestTokenizer } from './requestTokenizer.js';
import { DefaultRequestTokenizer } from './requestTokenizer.js';
export { TextTokenizer } from './textTokenizer.js';
export { ImageTokenizer } from './imageTokenizer.js';

export type {
  RequestTokenizer,
  TokenizerConfig,
  TokenCalculationResult,
  ImageMetadata,
} from './types.js';

// Singleton instance for convenient usage
let defaultTokenizer: DefaultRequestTokenizer | null = null;

/**
 * Get the default request tokenizer instance
 */
export function getDefaultTokenizer(): DefaultRequestTokenizer {
  if (!defaultTokenizer) {
    defaultTokenizer = new DefaultRequestTokenizer();
  }
  return defaultTokenizer;
}

/**
 * Dispose of the default tokenizer instance
 */
export async function disposeDefaultTokenizer(): Promise<void> {
  if (defaultTokenizer) {
    await defaultTokenizer.dispose();
    defaultTokenizer = null;
  }
}
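
A sketch of how a caller would use the singleton exported above (the request variable is assumed to be a CountTokensParameters value as in the tests below):

import { getDefaultTokenizer, disposeDefaultTokenizer } from './index.js';

// request: CountTokensParameters, assumed to be built by the caller
const result = await getDefaultTokenizer().calculateTokens(request);
console.log(result.totalTokens, result.breakdown.imageTokens);
// Later, e.g. on shutdown, release the cached tiktoken encoding:
await disposeDefaultTokenizer();
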
packages/core/src/utils/request-tokenizer/requestTokenizer.test.ts (new file, 293 lines)
@@ -0,0 +1,293 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { DefaultRequestTokenizer } from './requestTokenizer.js';
import type { CountTokensParameters } from '@google/genai';

describe('DefaultRequestTokenizer', () => {
  let tokenizer: DefaultRequestTokenizer;

  beforeEach(() => {
    tokenizer = new DefaultRequestTokenizer();
  });

  afterEach(async () => {
    await tokenizer.dispose();
  });

  describe('text token calculation', () => {
    it('should calculate tokens for simple text content', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [{ text: 'Hello, world!' }],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThan(0);
      expect(result.breakdown.textTokens).toBeGreaterThan(0);
      expect(result.breakdown.imageTokens).toBe(0);
      expect(result.processingTime).toBeGreaterThan(0);
    });

    it('should handle multiple text parts', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              { text: 'First part' },
              { text: 'Second part' },
              { text: 'Third part' },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThan(0);
      expect(result.breakdown.textTokens).toBeGreaterThan(0);
    });

    it('should handle string content', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: ['Simple string content'],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThan(0);
      expect(result.breakdown.textTokens).toBeGreaterThan(0);
    });
  });

  describe('image token calculation', () => {
    it('should calculate tokens for image content', async () => {
      // Create a simple 1x1 PNG image in base64
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: pngBase64,
                },
              },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThanOrEqual(4); // Minimum 4 tokens per image
      expect(result.breakdown.imageTokens).toBeGreaterThanOrEqual(4);
      expect(result.breakdown.textTokens).toBe(0);
    });

    it('should handle multiple images', async () => {
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: pngBase64,
                },
              },
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: pngBase64,
                },
              },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThanOrEqual(8); // At least 4 tokens per image
      expect(result.breakdown.imageTokens).toBeGreaterThanOrEqual(8);
    });
  });

  describe('mixed content', () => {
    it('should handle text and image content together', async () => {
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              { text: 'Here is an image:' },
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: pngBase64,
                },
              },
              { text: 'What do you see?' },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThan(4);
      expect(result.breakdown.textTokens).toBeGreaterThan(0);
      expect(result.breakdown.imageTokens).toBeGreaterThanOrEqual(4);
    });
  });

  describe('function content', () => {
    it('should handle function calls', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              {
                functionCall: {
                  name: 'test_function',
                  args: { param1: 'value1', param2: 42 },
                },
              },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThan(0);
      expect(result.breakdown.otherTokens).toBeGreaterThan(0);
    });
  });

  describe('empty content', () => {
    it('should handle empty request', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBe(0);
      expect(result.breakdown.textTokens).toBe(0);
      expect(result.breakdown.imageTokens).toBe(0);
    });

    it('should handle undefined contents', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBe(0);
    });
  });

  describe('configuration', () => {
    it('should use custom text encoding', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [{ text: 'Test text for encoding' }],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request, {
        textEncoding: 'cl100k_base',
      });

      expect(result.totalTokens).toBeGreaterThan(0);
    });

    it('should process multiple images serially', async () => {
      const pngBase64 =
        'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77yQAAAABJRU5ErkJggg==';

      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: Array(10).fill({
              inlineData: {
                mimeType: 'image/png',
                data: pngBase64,
              },
            }),
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      expect(result.totalTokens).toBeGreaterThanOrEqual(60); // At least 6 tokens per image * 10 images
    });
  });

  describe('error handling', () => {
    it('should handle malformed image data gracefully', async () => {
      const request: CountTokensParameters = {
        model: 'test-model',
        contents: [
          {
            role: 'user',
            parts: [
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: 'invalid-base64-data',
                },
              },
            ],
          },
        ],
      };

      const result = await tokenizer.calculateTokens(request);

      // Should still return some tokens (fallback to minimum)
      expect(result.totalTokens).toBeGreaterThanOrEqual(4);
    });
  });
});
packages/core/src/utils/request-tokenizer/requestTokenizer.ts (new file, 341 lines)
@@ -0,0 +1,341 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import type {
  CountTokensParameters,
  Content,
  Part,
  PartUnion,
} from '@google/genai';
import type {
  RequestTokenizer,
  TokenizerConfig,
  TokenCalculationResult,
} from './types.js';
import { TextTokenizer } from './textTokenizer.js';
import { ImageTokenizer } from './imageTokenizer.js';

/**
 * Simple request tokenizer that handles text and image content serially
 */
export class DefaultRequestTokenizer implements RequestTokenizer {
  private textTokenizer: TextTokenizer;
  private imageTokenizer: ImageTokenizer;

  constructor() {
    this.textTokenizer = new TextTokenizer();
    this.imageTokenizer = new ImageTokenizer();
  }

  /**
   * Calculate tokens for a request using serial processing
   */
  async calculateTokens(
    request: CountTokensParameters,
    config: TokenizerConfig = {},
  ): Promise<TokenCalculationResult> {
    const startTime = performance.now();

    // Apply configuration
    if (config.textEncoding) {
      this.textTokenizer = new TextTokenizer(config.textEncoding);
    }

    try {
      // Process request content and group by type
      const { textContents, imageContents, audioContents, otherContents } =
        this.processAndGroupContents(request);

      if (
        textContents.length === 0 &&
        imageContents.length === 0 &&
        audioContents.length === 0 &&
        otherContents.length === 0
      ) {
        return {
          totalTokens: 0,
          breakdown: {
            textTokens: 0,
            imageTokens: 0,
            audioTokens: 0,
            otherTokens: 0,
          },
          processingTime: performance.now() - startTime,
        };
      }

      // Calculate tokens for each content type serially
      const textTokens = await this.calculateTextTokens(textContents);
      const imageTokens = await this.calculateImageTokens(imageContents);
      const audioTokens = await this.calculateAudioTokens(audioContents);
      const otherTokens = await this.calculateOtherTokens(otherContents);

      const totalTokens = textTokens + imageTokens + audioTokens + otherTokens;
      const processingTime = performance.now() - startTime;

      return {
        totalTokens,
        breakdown: {
          textTokens,
          imageTokens,
          audioTokens,
          otherTokens,
        },
        processingTime,
      };
    } catch (error) {
      console.error('Error calculating tokens:', error);

      // Fallback calculation
      const fallbackTokens = this.calculateFallbackTokens(request);

      return {
        totalTokens: fallbackTokens,
        breakdown: {
          textTokens: fallbackTokens,
          imageTokens: 0,
          audioTokens: 0,
          otherTokens: 0,
        },
        processingTime: performance.now() - startTime,
      };
    }
  }

  /**
   * Calculate tokens for text contents
   */
  private async calculateTextTokens(textContents: string[]): Promise<number> {
    if (textContents.length === 0) return 0;

    try {
      const tokenCounts =
        await this.textTokenizer.calculateTokensBatch(textContents);
      return tokenCounts.reduce((sum, count) => sum + count, 0);
    } catch (error) {
      console.warn('Error calculating text tokens:', error);
      // Fallback: character-based estimation
      const totalChars = textContents.join('').length;
      return Math.ceil(totalChars / 4);
    }
  }

  /**
   * Calculate tokens for image contents using serial processing
   */
  private async calculateImageTokens(
    imageContents: Array<{ data: string; mimeType: string }>,
  ): Promise<number> {
    if (imageContents.length === 0) return 0;

    try {
      const tokenCounts =
        await this.imageTokenizer.calculateTokensBatch(imageContents);
      return tokenCounts.reduce((sum, count) => sum + count, 0);
    } catch (error) {
      console.warn('Error calculating image tokens:', error);
      // Fallback: minimum tokens per image
      return imageContents.length * 6; // 4 image tokens + 2 special tokens as minimum
    }
  }

  /**
   * Calculate tokens for audio contents
   * TODO: Implement proper audio token calculation
   */
  private async calculateAudioTokens(
    audioContents: Array<{ data: string; mimeType: string }>,
  ): Promise<number> {
    if (audioContents.length === 0) return 0;

    // Placeholder implementation - audio token calculation would depend on
    // the specific model's audio processing capabilities
    // For now, estimate based on data size
    let totalTokens = 0;

    for (const audioContent of audioContents) {
      try {
        const dataSize = Math.floor(audioContent.data.length * 0.75); // Approximate binary size
        // Rough estimate: 1 token per 100 bytes of audio data
        totalTokens += Math.max(Math.ceil(dataSize / 100), 10); // Minimum 10 tokens per audio
      } catch (error) {
        console.warn('Error calculating audio tokens:', error);
        totalTokens += 10; // Fallback minimum
      }
    }

    return totalTokens;
  }

  /**
   * Calculate tokens for other content types (functions, files, etc.)
   */
  private async calculateOtherTokens(otherContents: string[]): Promise<number> {
    if (otherContents.length === 0) return 0;

    try {
      // Treat other content as text for token calculation
      const tokenCounts =
        await this.textTokenizer.calculateTokensBatch(otherContents);
      return tokenCounts.reduce((sum, count) => sum + count, 0);
    } catch (error) {
      console.warn('Error calculating other content tokens:', error);
      // Fallback: character-based estimation
      const totalChars = otherContents.join('').length;
      return Math.ceil(totalChars / 4);
    }
  }

  /**
   * Fallback token calculation using simple string serialization
   */
  private calculateFallbackTokens(request: CountTokensParameters): number {
    try {
      const content = JSON.stringify(request.contents);
      return Math.ceil(content.length / 4); // Rough estimate: 1 token ≈ 4 characters
    } catch (error) {
      console.warn('Error in fallback token calculation:', error);
      return 100; // Conservative fallback
    }
  }

  /**
   * Process request contents and group by type
   */
  private processAndGroupContents(request: CountTokensParameters): {
    textContents: string[];
    imageContents: Array<{ data: string; mimeType: string }>;
    audioContents: Array<{ data: string; mimeType: string }>;
    otherContents: string[];
  } {
    const textContents: string[] = [];
    const imageContents: Array<{ data: string; mimeType: string }> = [];
    const audioContents: Array<{ data: string; mimeType: string }> = [];
    const otherContents: string[] = [];

    if (!request.contents) {
      return { textContents, imageContents, audioContents, otherContents };
    }

    const contents = Array.isArray(request.contents)
      ? request.contents
      : [request.contents];

    for (const content of contents) {
      this.processContent(
        content,
        textContents,
        imageContents,
        audioContents,
        otherContents,
      );
    }

    return { textContents, imageContents, audioContents, otherContents };
  }

  /**
   * Process a single content item and add to appropriate arrays
   */
  private processContent(
    content: Content | string | PartUnion,
    textContents: string[],
    imageContents: Array<{ data: string; mimeType: string }>,
    audioContents: Array<{ data: string; mimeType: string }>,
    otherContents: string[],
  ): void {
    if (typeof content === 'string') {
      if (content.trim()) {
        textContents.push(content);
      }
      return;
    }

    if ('parts' in content && content.parts) {
      for (const part of content.parts) {
        this.processPart(
          part,
          textContents,
          imageContents,
          audioContents,
          otherContents,
        );
      }
    }
  }

  /**
   * Process a single part and add to appropriate arrays
   */
  private processPart(
    part: Part | string,
    textContents: string[],
    imageContents: Array<{ data: string; mimeType: string }>,
    audioContents: Array<{ data: string; mimeType: string }>,
    otherContents: string[],
  ): void {
    if (typeof part === 'string') {
      if (part.trim()) {
        textContents.push(part);
      }
      return;
    }

    if ('text' in part && part.text) {
      textContents.push(part.text);
      return;
    }

    if ('inlineData' in part && part.inlineData) {
      const { data, mimeType } = part.inlineData;
      if (mimeType && mimeType.startsWith('image/')) {
        imageContents.push({ data: data || '', mimeType });
        return;
      }
      if (mimeType && mimeType.startsWith('audio/')) {
        audioContents.push({ data: data || '', mimeType });
        return;
      }
    }

    if ('fileData' in part && part.fileData) {
      otherContents.push(JSON.stringify(part.fileData));
      return;
    }

    if ('functionCall' in part && part.functionCall) {
      otherContents.push(JSON.stringify(part.functionCall));
      return;
    }

    if ('functionResponse' in part && part.functionResponse) {
      otherContents.push(JSON.stringify(part.functionResponse));
      return;
    }

    // Unknown part type - try to serialize
    try {
      const serialized = JSON.stringify(part);
      if (serialized && serialized !== '{}') {
        otherContents.push(serialized);
      }
    } catch (error) {
      console.warn('Failed to serialize unknown part type:', error);
    }
  }

  /**
   * Dispose of resources
   */
  async dispose(): Promise<void> {
    try {
      // Dispose of tokenizers
      this.textTokenizer.dispose();
    } catch (error) {
      console.warn('Error disposing request tokenizer:', error);
    }
  }
}
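
The commit message mentions adding the image tokenizer "to fit vlm context window"; a hypothetical guard along those lines might look like the sketch below (VISION_CONTEXT_WINDOW and the trimming step are assumptions for illustration, not part of this commit):

const tokenizer = new DefaultRequestTokenizer();
const { totalTokens, breakdown } = await tokenizer.calculateTokens(request);
if (totalTokens > VISION_CONTEXT_WINDOW) {
  // e.g. drop or downscale images, or trim history, before sending the request
  console.warn(
    `Request needs ${totalTokens} tokens (${breakdown.imageTokens} from images); trimming to fit.`,
  );
}
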
packages/core/src/utils/request-tokenizer/supportedImageFormats.ts (new file, 56 lines)
@@ -0,0 +1,56 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * Supported image MIME types for vision models
 * These formats are supported by the vision model and can be processed by the image tokenizer
 */
export const SUPPORTED_IMAGE_MIME_TYPES = [
  'image/bmp',
  'image/jpeg',
  'image/jpg', // Alternative MIME type for JPEG
  'image/png',
  'image/tiff',
  'image/webp',
  'image/heic',
] as const;

/**
 * Type for supported image MIME types
 */
export type SupportedImageMimeType =
  (typeof SUPPORTED_IMAGE_MIME_TYPES)[number];

/**
 * Check if a MIME type is supported for vision processing
 * @param mimeType The MIME type to check
 * @returns True if the MIME type is supported
 */
export function isSupportedImageMimeType(
  mimeType: string,
): mimeType is SupportedImageMimeType {
  return SUPPORTED_IMAGE_MIME_TYPES.includes(
    mimeType as SupportedImageMimeType,
  );
}

/**
 * Get a human-readable list of supported image formats
 * @returns Comma-separated string of supported formats
 */
export function getSupportedImageFormatsString(): string {
  return SUPPORTED_IMAGE_MIME_TYPES.map((type) =>
    type.replace('image/', '').toUpperCase(),
  ).join(', ');
}

/**
 * Get warning message for unsupported image formats
 * @returns Warning message string
 */
export function getUnsupportedImageFormatWarning(): string {
  return `Only the following image formats are supported: ${getSupportedImageFormatsString()}. Other formats may not work as expected.`;
}
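
A short sketch of how these helpers combine when validating an attachment (the mimeType variable is assumed to come from the caller):

if (!isSupportedImageMimeType(mimeType)) {
  // Surface the Bailian-aligned format list to the user
  console.warn(getUnsupportedImageFormatWarning());
}

Note that GIF has a dimension parser in imageTokenizer.ts but is intentionally absent from this list (per the "align supported image formats with bailian doc" item in the commit message), so GIF inputs fall back to the default 512x512 metadata.
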
packages/core/src/utils/request-tokenizer/textTokenizer.test.ts (new file, 347 lines)
@@ -0,0 +1,347 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { TextTokenizer } from './textTokenizer.js';

// Mock tiktoken at the top level with hoisted functions
const mockEncode = vi.hoisted(() => vi.fn());
const mockFree = vi.hoisted(() => vi.fn());
const mockGetEncoding = vi.hoisted(() => vi.fn());

vi.mock('tiktoken', () => ({
  get_encoding: mockGetEncoding,
}));

describe('TextTokenizer', () => {
  let tokenizer: TextTokenizer;
  let consoleWarnSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    vi.resetAllMocks();
    consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

    // Default mock implementation
    mockGetEncoding.mockReturnValue({
      encode: mockEncode,
      free: mockFree,
    });
  });

  afterEach(() => {
    vi.restoreAllMocks();
    tokenizer?.dispose();
  });

  describe('constructor', () => {
    it('should create tokenizer with default encoding', () => {
      tokenizer = new TextTokenizer();
      expect(tokenizer).toBeInstanceOf(TextTokenizer);
    });

    it('should create tokenizer with custom encoding', () => {
      tokenizer = new TextTokenizer('gpt2');
      expect(tokenizer).toBeInstanceOf(TextTokenizer);
    });
  });

  describe('calculateTokens', () => {
    beforeEach(() => {
      tokenizer = new TextTokenizer();
    });

    it('should return 0 for empty text', async () => {
      const result = await tokenizer.calculateTokens('');
      expect(result).toBe(0);
    });

    it('should return 0 for null/undefined text', async () => {
      const result1 = await tokenizer.calculateTokens(
        null as unknown as string,
      );
      const result2 = await tokenizer.calculateTokens(
        undefined as unknown as string,
      );
      expect(result1).toBe(0);
      expect(result2).toBe(0);
    });

    it('should calculate tokens using tiktoken when available', async () => {
      const testText = 'Hello, world!';
      const mockTokens = [1, 2, 3, 4, 5]; // 5 tokens
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(testText);

      expect(mockGetEncoding).toHaveBeenCalledWith('cl100k_base');
      expect(mockEncode).toHaveBeenCalledWith(testText);
      expect(result).toBe(5);
    });

    it('should use fallback calculation when tiktoken fails to load', async () => {
      mockGetEncoding.mockImplementation(() => {
        throw new Error('Failed to load tiktoken');
      });

      const testText = 'Hello, world!'; // 13 characters
      const result = await tokenizer.calculateTokens(testText);

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Failed to load tiktoken with encoding cl100k_base:',
        expect.any(Error),
      );
      // Fallback: Math.ceil(13 / 4) = 4
      expect(result).toBe(4);
    });

    it('should use fallback calculation when encoding fails', async () => {
      mockEncode.mockImplementation(() => {
        throw new Error('Encoding failed');
      });

      const testText = 'Hello, world!'; // 13 characters
      const result = await tokenizer.calculateTokens(testText);

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Error encoding text with tiktoken:',
        expect.any(Error),
      );
      // Fallback: Math.ceil(13 / 4) = 4
      expect(result).toBe(4);
    });

    it('should handle very long text', async () => {
      const longText = 'a'.repeat(10000);
      const mockTokens = new Array(2500); // 2500 tokens
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(longText);

      expect(result).toBe(2500);
    });

    it('should handle unicode characters', async () => {
      const unicodeText = '你好世界 🌍';
      const mockTokens = [1, 2, 3, 4, 5, 6];
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(unicodeText);

      expect(result).toBe(6);
    });

    it('should use custom encoding when specified', async () => {
      tokenizer = new TextTokenizer('gpt2');
      const testText = 'Hello, world!';
      const mockTokens = [1, 2, 3];
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(testText);

      expect(mockGetEncoding).toHaveBeenCalledWith('gpt2');
      expect(result).toBe(3);
    });
  });

  describe('calculateTokensBatch', () => {
    beforeEach(() => {
      tokenizer = new TextTokenizer();
    });

    it('should process multiple texts and return token counts', async () => {
      const texts = ['Hello', 'world', 'test'];
      mockEncode
        .mockReturnValueOnce([1, 2]) // 2 tokens for 'Hello'
        .mockReturnValueOnce([3, 4, 5]) // 3 tokens for 'world'
        .mockReturnValueOnce([6]); // 1 token for 'test'

      const result = await tokenizer.calculateTokensBatch(texts);

      expect(result).toEqual([2, 3, 1]);
      expect(mockEncode).toHaveBeenCalledTimes(3);
    });

    it('should handle empty array', async () => {
      const result = await tokenizer.calculateTokensBatch([]);
      expect(result).toEqual([]);
    });

    it('should handle array with empty strings', async () => {
      const texts = ['', 'hello', ''];
      mockEncode.mockReturnValue([1, 2, 3]); // Only called for 'hello'

      const result = await tokenizer.calculateTokensBatch(texts);

      expect(result).toEqual([0, 3, 0]);
      expect(mockEncode).toHaveBeenCalledTimes(1);
      expect(mockEncode).toHaveBeenCalledWith('hello');
    });

    it('should use fallback calculation when tiktoken fails to load', async () => {
      mockGetEncoding.mockImplementation(() => {
        throw new Error('Failed to load tiktoken');
      });

      const texts = ['Hello', 'world']; // 5 and 5 characters
      const result = await tokenizer.calculateTokensBatch(texts);

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Failed to load tiktoken with encoding cl100k_base:',
        expect.any(Error),
      );
      // Fallback: Math.ceil(5/4) = 2 for both
      expect(result).toEqual([2, 2]);
    });

    it('should use fallback calculation when encoding fails during batch processing', async () => {
      mockEncode.mockImplementation(() => {
        throw new Error('Encoding failed');
      });

      const texts = ['Hello', 'world']; // 5 and 5 characters
      const result = await tokenizer.calculateTokensBatch(texts);

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Error encoding texts with tiktoken:',
        expect.any(Error),
      );
      // Fallback: Math.ceil(5/4) = 2 for both
      expect(result).toEqual([2, 2]);
    });

    it('should handle null and undefined values in batch', async () => {
      const texts = [null, 'hello', undefined, 'world'] as unknown as string[];
      mockEncode
        .mockReturnValueOnce([1, 2, 3]) // 3 tokens for 'hello'
        .mockReturnValueOnce([4, 5]); // 2 tokens for 'world'

      const result = await tokenizer.calculateTokensBatch(texts);

      expect(result).toEqual([0, 3, 0, 2]);
    });
  });

  describe('dispose', () => {
    beforeEach(() => {
      tokenizer = new TextTokenizer();
    });

    it('should free tiktoken encoding when disposing', async () => {
      // Initialize the encoding by calling calculateTokens
      await tokenizer.calculateTokens('test');

      tokenizer.dispose();

      expect(mockFree).toHaveBeenCalled();
    });

    it('should handle disposal when encoding is not initialized', () => {
      expect(() => tokenizer.dispose()).not.toThrow();
      expect(mockFree).not.toHaveBeenCalled();
    });

    it('should handle disposal when encoding is null', async () => {
      // Force encoding to be null by making tiktoken fail
      mockGetEncoding.mockImplementation(() => {
        throw new Error('Failed to load');
      });

      await tokenizer.calculateTokens('test');

      expect(() => tokenizer.dispose()).not.toThrow();
      expect(mockFree).not.toHaveBeenCalled();
    });

    it('should handle errors during disposal gracefully', async () => {
      await tokenizer.calculateTokens('test');

      mockFree.mockImplementation(() => {
        throw new Error('Free failed');
      });

      tokenizer.dispose();

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        'Error freeing tiktoken encoding:',
        expect.any(Error),
      );
    });

    it('should allow multiple calls to dispose', async () => {
      await tokenizer.calculateTokens('test');

      tokenizer.dispose();
      tokenizer.dispose(); // Second call should not throw

      expect(mockFree).toHaveBeenCalledTimes(1);
    });
  });

  describe('lazy initialization', () => {
    beforeEach(() => {
      tokenizer = new TextTokenizer();
    });

    it('should not initialize tiktoken until first use', () => {
      expect(mockGetEncoding).not.toHaveBeenCalled();
    });

    it('should initialize tiktoken on first calculateTokens call', async () => {
      await tokenizer.calculateTokens('test');
      expect(mockGetEncoding).toHaveBeenCalledTimes(1);
    });

    it('should not reinitialize tiktoken on subsequent calls', async () => {
      await tokenizer.calculateTokens('test1');
      await tokenizer.calculateTokens('test2');

      expect(mockGetEncoding).toHaveBeenCalledTimes(1);
    });

    it('should initialize tiktoken on first calculateTokensBatch call', async () => {
      await tokenizer.calculateTokensBatch(['test']);
      expect(mockGetEncoding).toHaveBeenCalledTimes(1);
    });
  });

  describe('edge cases', () => {
    beforeEach(() => {
      tokenizer = new TextTokenizer();
    });

    it('should handle very short text', async () => {
      const result = await tokenizer.calculateTokens('a');

      if (mockGetEncoding.mock.calls.length > 0) {
        // If tiktoken was called, use its result
        expect(mockEncode).toHaveBeenCalledWith('a');
      } else {
        // If tiktoken failed, should use fallback: Math.ceil(1/4) = 1
        expect(result).toBe(1);
      }
    });

    it('should handle text with only whitespace', async () => {
      const whitespaceText = ' \n\t ';
      const mockTokens = [1];
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(whitespaceText);

      expect(result).toBe(1);
    });

    it('should handle special characters and symbols', async () => {
      const specialText = '!@#$%^&*()_+-=[]{}|;:,.<>?';
      const mockTokens = new Array(10);
      mockEncode.mockReturnValue(mockTokens);

      const result = await tokenizer.calculateTokens(specialText);

      expect(result).toBe(10);
    });
  });
});
packages/core/src/utils/request-tokenizer/textTokenizer.ts (new file, 97 lines)
@@ -0,0 +1,97 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import type { TiktokenEncoding, Tiktoken } from 'tiktoken';
import { get_encoding } from 'tiktoken';

/**
 * Text tokenizer for calculating text tokens using tiktoken
 */
export class TextTokenizer {
  private encoding: Tiktoken | null = null;
  private encodingName: string;

  constructor(encodingName: string = 'cl100k_base') {
    this.encodingName = encodingName;
  }

  /**
   * Initialize the tokenizer (lazy loading)
   */
  private async ensureEncoding(): Promise<void> {
    if (this.encoding) return;

    try {
      // Use type assertion since we know the encoding name is valid
      this.encoding = get_encoding(this.encodingName as TiktokenEncoding);
    } catch (error) {
      console.warn(
        `Failed to load tiktoken with encoding ${this.encodingName}:`,
        error,
      );
      this.encoding = null;
    }
  }

  /**
   * Calculate tokens for text content
   */
  async calculateTokens(text: string): Promise<number> {
    if (!text) return 0;

    await this.ensureEncoding();

    if (this.encoding) {
      try {
        return this.encoding.encode(text).length;
      } catch (error) {
        console.warn('Error encoding text with tiktoken:', error);
      }
    }

    // Fallback: rough approximation using character count
    // This is a conservative estimate: 1 token ≈ 4 characters for most languages
    return Math.ceil(text.length / 4);
  }

  /**
   * Calculate tokens for multiple text strings in a single batch
   */
  async calculateTokensBatch(texts: string[]): Promise<number[]> {
    await this.ensureEncoding();

    if (this.encoding) {
      try {
        return texts.map((text) => {
          if (!text) return 0;
          // this.encoding may be null, add a null check to satisfy lint
          return this.encoding ? this.encoding.encode(text).length : 0;
        });
      } catch (error) {
        console.warn('Error encoding texts with tiktoken:', error);
        // In case of error, return fallback estimation for all texts
        return texts.map((text) => Math.ceil((text || '').length / 4));
      }
    }

    // Fallback for batch processing
    return texts.map((text) => Math.ceil((text || '').length / 4));
  }

  /**
   * Dispose of resources
   */
  dispose(): void {
    if (this.encoding) {
      try {
        this.encoding.free();
      } catch (error) {
        console.warn('Error freeing tiktoken encoding:', error);
      }
      this.encoding = null;
    }
  }
}
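
Behavior sketch for the class above: with tiktoken available the count is the exact BPE length, and when tiktoken cannot be loaded the estimate degrades to roughly one token per four characters (illustrative):

const textTokenizer = new TextTokenizer(); // defaults to cl100k_base
const n = await textTokenizer.calculateTokens('Hello, world!');
// With tiktoken loaded, n is the encoded length; without it,
// the fallback gives Math.ceil(13 / 4) = 4.
textTokenizer.dispose(); // frees the underlying encoding
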
packages/core/src/utils/request-tokenizer/types.ts (new file, 64 lines)
@@ -0,0 +1,64 @@
/**
 * @license
 * Copyright 2025 Qwen
 * SPDX-License-Identifier: Apache-2.0
 */

import type { CountTokensParameters } from '@google/genai';

/**
 * Token calculation result for different content types
 */
export interface TokenCalculationResult {
  /** Total tokens calculated */
  totalTokens: number;
  /** Breakdown by content type */
  breakdown: {
    textTokens: number;
    imageTokens: number;
    audioTokens: number;
    otherTokens: number;
  };
  /** Processing time in milliseconds */
  processingTime: number;
}

/**
 * Configuration for token calculation
 */
export interface TokenizerConfig {
  /** Custom text tokenizer encoding (defaults to cl100k_base) */
  textEncoding?: string;
}

/**
 * Image metadata extracted from base64 data
 */
export interface ImageMetadata {
  /** Image width in pixels */
  width: number;
  /** Image height in pixels */
  height: number;
  /** MIME type of the image */
  mimeType: string;
  /** Size of the base64 data in bytes */
  dataSize: number;
}

/**
 * Request tokenizer interface
 */
export interface RequestTokenizer {
  /**
   * Calculate tokens for a request
   */
  calculateTokens(
    request: CountTokensParameters,
    config?: TokenizerConfig,
  ): Promise<TokenCalculationResult>;

  /**
   * Dispose of resources (worker threads, etc.)
   */
  dispose(): Promise<void>;
}