From f0e21374c1985be60d780dd05521c513f8137674 Mon Sep 17 00:00:00 2001
From: hj C <76840587+seems20@users.noreply.github.com>
Date: Fri, 14 Nov 2025 18:09:33 +0800
Subject: [PATCH] feat: add support for alternative cached_tokens format in
 OpenAI converter (#1035)

Co-authored-by: chenhuanjie
---
 .../core/openaiContentGenerator/converter.ts | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/packages/core/src/core/openaiContentGenerator/converter.ts b/packages/core/src/core/openaiContentGenerator/converter.ts
index 7966f384..1edbdd6e 100644
--- a/packages/core/src/core/openaiContentGenerator/converter.ts
+++ b/packages/core/src/core/openaiContentGenerator/converter.ts
@@ -23,6 +23,14 @@ import type OpenAI from 'openai';
 import { safeJsonParse } from '../../utils/safeJsonParse.js';
 import { StreamingToolCallParser } from './streamingToolCallParser.js';
 
+/**
+ * Extended usage type that supports both OpenAI standard format and alternative formats
+ * Some models return cached_tokens at the top level instead of in prompt_tokens_details
+ */
+interface ExtendedCompletionUsage extends OpenAI.CompletionUsage {
+  cached_tokens?: number;
+}
+
 /**
  * Tool call accumulator for streaming responses
  */
@@ -582,7 +590,13 @@ export class OpenAIContentConverter {
     const promptTokens = usage.prompt_tokens || 0;
     const completionTokens = usage.completion_tokens || 0;
     const totalTokens = usage.total_tokens || 0;
-    const cachedTokens = usage.prompt_tokens_details?.cached_tokens || 0;
+    // Support both formats: prompt_tokens_details.cached_tokens (OpenAI standard)
+    // and cached_tokens (some models return it at top level)
+    const extendedUsage = usage as ExtendedCompletionUsage;
+    const cachedTokens =
+      usage.prompt_tokens_details?.cached_tokens ??
+      extendedUsage.cached_tokens ??
+      0;
 
     // If we only have total tokens but no breakdown, estimate the split
     // Typically input is ~70% and output is ~30% for most conversations
@@ -707,7 +721,13 @@ export class OpenAIContentConverter {
     const promptTokens = usage.prompt_tokens || 0;
     const completionTokens = usage.completion_tokens || 0;
     const totalTokens = usage.total_tokens || 0;
-    const cachedTokens = usage.prompt_tokens_details?.cached_tokens || 0;
+    // Support both formats: prompt_tokens_details.cached_tokens (OpenAI standard)
+    // and cached_tokens (some models return it at top level)
+    const extendedUsage = usage as ExtendedCompletionUsage;
+    const cachedTokens =
+      usage.prompt_tokens_details?.cached_tokens ??
+      extendedUsage.cached_tokens ??
+      0;
 
     // If we only have total tokens but no breakdown, estimate the split
     // Typically input is ~70% and output is ~30% for most conversations
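
Note (illustrative sketch, not part of the patch): the snippet below shows the fallback order the diff introduces for cached token counts. UsageLike and resolveCachedTokens are hypothetical stand-ins for OpenAI.CompletionUsage and the inline logic in converter.ts, written under the assumption that a top-level cached_tokens field may appear instead of prompt_tokens_details.

// Minimal, self-contained TypeScript sketch of the fallback behavior.
interface UsageLike {
  prompt_tokens?: number;
  completion_tokens?: number;
  total_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number };
  // Alternative format: some models report cached tokens at the top level.
  cached_tokens?: number;
}

function resolveCachedTokens(usage: UsageLike): number {
  // Prefer the OpenAI-standard nested field, then the top-level variant,
  // then default to 0. Nullish coalescing (??) keeps an explicit 0 in the
  // nested field from falling through to the top-level value.
  return (
    usage.prompt_tokens_details?.cached_tokens ??
    usage.cached_tokens ??
    0
  );
}

// Both response shapes resolve to the same count:
console.log(resolveCachedTokens({ prompt_tokens_details: { cached_tokens: 128 } })); // 128
console.log(resolveCachedTokens({ cached_tokens: 128 })); // 128
console.log(resolveCachedTokens({ prompt_tokens: 10 })); // 0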