From f0e21374c1985be60d780dd05521c513f8137674 Mon Sep 17 00:00:00 2001
From: hj C <76840587+seems20@users.noreply.github.com>
Date: Fri, 14 Nov 2025 18:09:33 +0800
Subject: [PATCH] feat: add support for alternative cached_tokens format in
 OpenAI converter (#1035)

Co-authored-by: chenhuanjie
---
 .../core/openaiContentGenerator/converter.ts | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/packages/core/src/core/openaiContentGenerator/converter.ts b/packages/core/src/core/openaiContentGenerator/converter.ts
index 7966f384..1edbdd6e 100644
--- a/packages/core/src/core/openaiContentGenerator/converter.ts
+++ b/packages/core/src/core/openaiContentGenerator/converter.ts
@@ -23,6 +23,14 @@ import type OpenAI from 'openai';
 import { safeJsonParse } from '../../utils/safeJsonParse.js';
 import { StreamingToolCallParser } from './streamingToolCallParser.js';
 
+/**
+ * Extended usage type that supports both OpenAI standard format and alternative formats
+ * Some models return cached_tokens at the top level instead of in prompt_tokens_details
+ */
+interface ExtendedCompletionUsage extends OpenAI.CompletionUsage {
+  cached_tokens?: number;
+}
+
 /**
  * Tool call accumulator for streaming responses
  */
@@ -582,7 +590,13 @@ export class OpenAIContentConverter {
     const promptTokens = usage.prompt_tokens || 0;
     const completionTokens = usage.completion_tokens || 0;
     const totalTokens = usage.total_tokens || 0;
-    const cachedTokens = usage.prompt_tokens_details?.cached_tokens || 0;
+    // Support both formats: prompt_tokens_details.cached_tokens (OpenAI standard)
+    // and cached_tokens (some models return it at top level)
+    const extendedUsage = usage as ExtendedCompletionUsage;
+    const cachedTokens =
+      usage.prompt_tokens_details?.cached_tokens ??
+      extendedUsage.cached_tokens ??
+      0;
 
     // If we only have total tokens but no breakdown, estimate the split
     // Typically input is ~70% and output is ~30% for most conversations
@@ -707,7 +721,13 @@ export class OpenAIContentConverter {
     const promptTokens = usage.prompt_tokens || 0;
     const completionTokens = usage.completion_tokens || 0;
     const totalTokens = usage.total_tokens || 0;
-    const cachedTokens = usage.prompt_tokens_details?.cached_tokens || 0;
+    // Support both formats: prompt_tokens_details.cached_tokens (OpenAI standard)
+    // and cached_tokens (some models return it at top level)
+    const extendedUsage = usage as ExtendedCompletionUsage;
+    const cachedTokens =
+      usage.prompt_tokens_details?.cached_tokens ??
+      extendedUsage.cached_tokens ??
+      0;
 
     // If we only have total tokens but no breakdown, estimate the split
     // Typically input is ~70% and output is ~30% for most conversations
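
Note (illustrative sketch, not part of the patch): the snippet below shows the fallback order the diff introduces for cached token counts. UsageLike and resolveCachedTokens are hypothetical stand-ins for OpenAI.CompletionUsage and the inline logic in converter.ts, written under the assumption that a top-level cached_tokens field may appear instead of prompt_tokens_details.

// Minimal, self-contained TypeScript sketch of the fallback behavior.
interface UsageLike {
  prompt_tokens?: number;
  completion_tokens?: number;
  total_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number };
  // Alternative format: some models report cached tokens at the top level.
  cached_tokens?: number;
}

function resolveCachedTokens(usage: UsageLike): number {
  // Prefer the OpenAI-standard nested field, then the top-level variant,
  // then default to 0. Nullish coalescing (??) keeps an explicit 0 in the
  // nested field from falling through to the top-level value.
  return (
    usage.prompt_tokens_details?.cached_tokens ??
    usage.cached_tokens ??
    0
  );
}

// Both response shapes resolve to the same count:
console.log(resolveCachedTokens({ prompt_tokens_details: { cached_tokens: 128 } })); // 128
console.log(resolveCachedTokens({ cached_tokens: 128 })); // 128
console.log(resolveCachedTokens({ prompt_tokens: 10 })); // 0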