feat: add multi-modal input support (image, PDF, audio) across all content generators

Merge pull request #1548 from QwenLM/mingholy/fix/qwen-oauth-model-info
Fix: Update Qwen OAuth model information
2026-01-21 08:16:21 +00:00 · 2026-01-21 15:44:58 +08:00 · 2026-01-20 16:16:30 +08:00 · 2026-01-20 15:11:11 +08:00
19 changed files with 1108 additions and 434 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,7 @@
 !.gemini/config.yaml
 !.gemini/commands/

-# Note: .gemini-clipboard/ is NOT in gitignore so Gemini can access pasted images
+# Note: .qwen-clipboard/ is NOT in gitignore so Qwen Code can access pasted images

 # Dependency directory
 node_modules
--- a/packages/cli/src/ui/components/InputPrompt.test.tsx
+++ b/packages/cli/src/ui/components/InputPrompt.test.tsx
@@ -376,7 +376,7 @@ describe('InputPrompt', () => {
    it('should handle Ctrl+V when clipboard has an image', async () => {
      vi.mocked(clipboardUtils.clipboardHasImage).mockResolvedValue(true);
      vi.mocked(clipboardUtils.saveClipboardImage).mockResolvedValue(
-        '/test/.gemini-clipboard/clipboard-123.png',
+        '/test/.qwen-clipboard/clipboard-123.png',
      );

      const { stdin, unmount } = renderWithProviders(
@@ -436,7 +436,7 @@ describe('InputPrompt', () => {
    it('should insert image path at cursor position with proper spacing', async () => {
      const imagePath = path.join(
        'test',
-        '.gemini-clipboard',
+        '.qwen-clipboard',
        'clipboard-456.png',
      );
      vi.mocked(clipboardUtils.clipboardHasImage).mockResolvedValue(true);
--- a/packages/cli/src/ui/utils/clipboardUtils.ts
+++ b/packages/cli/src/ui/utils/clipboardUtils.ts
@@ -44,7 +44,7 @@ export async function saveClipboardImage(
    // Create a temporary directory for clipboard images within the target directory
    // This avoids security restrictions on paths outside the target directory
    const baseDir = targetDir || process.cwd();
-    const tempDir = path.join(baseDir, '.gemini-clipboard');
+    const tempDir = path.join(baseDir, '.qwen-clipboard');
    await fs.mkdir(tempDir, { recursive: true });

    // Generate a unique filename with timestamp
@@ -120,7 +120,7 @@ export async function cleanupOldClipboardImages(
 ): Promise<void> {
  try {
    const baseDir = targetDir || process.cwd();
-    const tempDir = path.join(baseDir, '.gemini-clipboard');
+    const tempDir = path.join(baseDir, '.qwen-clipboard');
    const files = await fs.readdir(tempDir);
    const oneHourAgo = Date.now() - 60 * 60 * 1000;

--- a/packages/core/src/core/anthropicContentGenerator/converter.test.ts
+++ b/packages/core/src/core/anthropicContentGenerator/converter.test.ts
@@ -208,6 +208,238 @@ describe('AnthropicContentConverter', () => {
        ],
      });
    });
+
+    it('converts function response with inlineData image parts into tool_result with images', () => {
+      const { messages } = converter.convertGeminiRequestToAnthropic({
+        model: 'models/test',
+        contents: [
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call-1',
+                  name: 'Read',
+                  response: { output: 'Image content' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'image/png',
+                        data: 'base64encodeddata',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      });
+
+      expect(messages).toEqual([
+        {
+          role: 'user',
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'call-1',
+              content: [
+                { type: 'text', text: 'Image content' },
+                {
+                  type: 'image',
+                  source: {
+                    type: 'base64',
+                    media_type: 'image/png',
+                    data: 'base64encodeddata',
+                  },
+                },
+              ],
+            },
+          ],
+        },
+      ]);
+    });
+
+    it('renders non-image inlineData as a text block (avoids invalid image media_type)', () => {
+      const { messages } = converter.convertGeminiRequestToAnthropic({
+        model: 'models/test',
+        contents: [
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call-1',
+                  name: 'Read',
+                  response: { output: 'Audio content' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'audio/mpeg',
+                        data: 'base64encodedaudiodata',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      });
+
+      expect(messages).toHaveLength(1);
+      expect(messages[0]?.role).toBe('user');
+
+      const toolResult = messages[0]?.content?.[0] as {
+        type: string;
+        content: Array<{ type: string; text?: string }>;
+      };
+      expect(toolResult.type).toBe('tool_result');
+      expect(Array.isArray(toolResult.content)).toBe(true);
+      expect(toolResult.content[0]).toEqual({
+        type: 'text',
+        text: 'Audio content',
+      });
+      expect(toolResult.content[1]?.type).toBe('text');
+      expect(toolResult.content[1]?.text).toContain(
+        'Unsupported inline media type for Anthropic',
+      );
+      expect(toolResult.content[1]?.text).toContain('audio/mpeg');
+    });
+
+    it('converts fileData with PDF into document block', () => {
+      const { messages } = converter.convertGeminiRequestToAnthropic({
+        model: 'models/test',
+        contents: [
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call-1',
+                  name: 'Read',
+                  response: { output: 'PDF content' },
+                  parts: [
+                    {
+                      fileData: {
+                        mimeType: 'application/pdf',
+                        fileUri: 'pdfbase64data',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      });
+
+      expect(messages).toEqual([
+        {
+          role: 'user',
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'call-1',
+              content: [
+                { type: 'text', text: 'PDF content' },
+                {
+                  type: 'document',
+                  source: {
+                    type: 'base64',
+                    media_type: 'application/pdf',
+                    data: 'pdfbase64data',
+                  },
+                },
+              ],
+            },
+          ],
+        },
+      ]);
+    });
+
+    it('associates each image with its preceding functionResponse', () => {
+      const { messages } = converter.convertGeminiRequestToAnthropic({
+        model: 'models/test',
+        contents: [
+          {
+            role: 'user',
+            parts: [
+              // Tool 1 with image 1
+              {
+                functionResponse: {
+                  id: 'call-1',
+                  name: 'Read',
+                  response: { output: 'File 1' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'image/png',
+                        data: 'image1data',
+                      },
+                    },
+                  ],
+                },
+              },
+              // Tool 2 with image 2
+              {
+                functionResponse: {
+                  id: 'call-2',
+                  name: 'Read',
+                  response: { output: 'File 2' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'image/jpeg',
+                        data: 'image2data',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      });
+
+      // Multiple tool_result blocks are emitted in order
+      expect(messages).toHaveLength(1);
+      expect(messages[0]).toEqual({
+        role: 'user',
+        content: [
+          {
+            type: 'tool_result',
+            tool_use_id: 'call-1',
+            content: [
+              { type: 'text', text: 'File 1' },
+              {
+                type: 'image',
+                source: {
+                  type: 'base64',
+                  media_type: 'image/png',
+                  data: 'image1data',
+                },
+              },
+            ],
+          },
+          {
+            type: 'tool_result',
+            tool_use_id: 'call-2',
+            content: [
+              { type: 'text', text: 'File 2' },
+              {
+                type: 'image',
+                source: {
+                  type: 'base64',
+                  media_type: 'image/jpeg',
+                  data: 'image2data',
+                },
+              },
+            ],
+          },
+        ],
+      });
+    });
  });

  describe('convertGeminiToolsToAnthropic', () => {
--- a/packages/core/src/core/anthropicContentGenerator/converter.ts
+++ b/packages/core/src/core/anthropicContentGenerator/converter.ts
@@ -10,7 +10,6 @@ import type {
  Content,
  ContentListUnion,
  ContentUnion,
-  FunctionCall,
  FunctionResponse,
  GenerateContentParameters,
  Part,
@@ -30,15 +29,6 @@ type AnthropicMessageParam = Anthropic.MessageParam;
 type AnthropicToolParam = Anthropic.Tool;
 type AnthropicContentBlockParam = Anthropic.ContentBlockParam;

-type ThoughtPart = { text: string; signature?: string };
-
-interface ParsedParts {
-  thoughtParts: ThoughtPart[];
-  contentParts: string[];
-  functionCalls: FunctionCall[];
-  functionResponses: FunctionResponse[];
-}
-
 export class AnthropicContentConverter {
  private model: string;
  private schemaCompliance: SchemaComplianceMode;
@@ -228,127 +218,161 @@ export class AnthropicContentConverter {
    }

    if (!this.isContentObject(content)) return;
-
-    const parsed = this.parseParts(content.parts || []);
-
-    if (parsed.functionResponses.length > 0) {
-      for (const response of parsed.functionResponses) {
-        messages.push({
-          role: 'user',
-          content: [
-            {
-              type: 'tool_result',
-              tool_use_id: response.id || '',
-              content: this.extractFunctionResponseContent(response.response),
-            },
-          ],
-        });
-      }
-      return;
-    }
-
-    if (content.role === 'model' && parsed.functionCalls.length > 0) {
-      const thinkingBlocks: AnthropicContentBlockParam[] =
-        parsed.thoughtParts.map((part) => {
-          const thinkingBlock: unknown = {
-            type: 'thinking',
-            thinking: part.text,
-          };
-          if (part.signature) {
-            (thinkingBlock as { signature?: string }).signature =
-              part.signature;
-          }
-          return thinkingBlock as AnthropicContentBlockParam;
-        });
-      const toolUses: AnthropicContentBlockParam[] = parsed.functionCalls.map(
-        (call, index) => ({
-          type: 'tool_use',
-          id: call.id || `tool_${index}`,
-          name: call.name || '',
-          input: (call.args as Record<string, unknown>) || {},
-        }),
-      );
-
-      const textBlocks: AnthropicContentBlockParam[] = parsed.contentParts.map(
-        (text) => ({
-          type: 'text' as const,
-          text,
-        }),
-      );
-
-      messages.push({
-        role: 'assistant',
-        content: [...thinkingBlocks, ...textBlocks, ...toolUses],
-      });
-      return;
-    }
-
+    const parts = content.parts || [];
    const role = content.role === 'model' ? 'assistant' : 'user';
-    const thinkingBlocks: AnthropicContentBlockParam[] =
-      role === 'assistant'
-        ? parsed.thoughtParts.map((part) => {
-            const thinkingBlock: unknown = {
-              type: 'thinking',
-              thinking: part.text,
-            };
-            if (part.signature) {
-              (thinkingBlock as { signature?: string }).signature =
-                part.signature;
-            }
-            return thinkingBlock as AnthropicContentBlockParam;
-          })
-        : [];
-    const textBlocks: AnthropicContentBlockParam[] = [
-      ...thinkingBlocks,
-      ...parsed.contentParts.map((text) => ({
-        type: 'text' as const,
-        text,
-      })),
-    ];
-    if (textBlocks.length > 0) {
-      messages.push({ role, content: textBlocks });
-    }
-  }
-
-  private parseParts(parts: Part[]): ParsedParts {
-    const thoughtParts: ThoughtPart[] = [];
-    const contentParts: string[] = [];
-    const functionCalls: FunctionCall[] = [];
-    const functionResponses: FunctionResponse[] = [];
+    const contentBlocks: AnthropicContentBlockParam[] = [];
+    let toolCallIndex = 0;

    for (const part of parts) {
      if (typeof part === 'string') {
-        contentParts.push(part);
-      } else if (
-        'text' in part &&
-        part.text &&
-        !('thought' in part && part.thought)
-      ) {
-        contentParts.push(part.text);
-      } else if ('text' in part && 'thought' in part && part.thought) {
-        thoughtParts.push({
-          text: part.text || '',
-          signature:
+        contentBlocks.push({ type: 'text', text: part });
+        continue;
+      }
+
+      if ('text' in part && 'thought' in part && part.thought) {
+        if (role === 'assistant') {
+          const thinkingBlock: unknown = {
+            type: 'thinking',
+            thinking: part.text || '',
+          };
+          if (
            'thoughtSignature' in part &&
            typeof part.thoughtSignature === 'string'
-              ? part.thoughtSignature
-              : undefined,
-        });
-      } else if ('functionCall' in part && part.functionCall) {
-        functionCalls.push(part.functionCall);
-      } else if ('functionResponse' in part && part.functionResponse) {
-        functionResponses.push(part.functionResponse);
+          ) {
+            (thinkingBlock as { signature?: string }).signature =
+              part.thoughtSignature;
+          }
+          contentBlocks.push(thinkingBlock as AnthropicContentBlockParam);
+        }
+      }
+
+      if ('text' in part && part.text && !('thought' in part && part.thought)) {
+        contentBlocks.push({ type: 'text', text: part.text });
+      }
+
+      const mediaBlock = this.createMediaBlockFromPart(part);
+      if (mediaBlock) {
+        contentBlocks.push(mediaBlock);
+      }
+
+      if ('functionCall' in part && part.functionCall) {
+        if (role === 'assistant') {
+          contentBlocks.push({
+            type: 'tool_use',
+            id: part.functionCall.id || `tool_${toolCallIndex}`,
+            name: part.functionCall.name || '',
+            input: (part.functionCall.args as Record<string, unknown>) || {},
+          });
+          toolCallIndex += 1;
+        }
+      }
+
+      if (part.functionResponse) {
+        const toolResultBlock = this.createToolResultBlock(
+          part.functionResponse,
+        );
+        if (toolResultBlock && role === 'user') {
+          contentBlocks.push(toolResultBlock);
+        }
      }
    }

+    if (contentBlocks.length > 0) {
+      messages.push({ role, content: contentBlocks });
+    }
+  }
+
+  private createToolResultBlock(
+    response: FunctionResponse,
+  ): Anthropic.ToolResultBlockParam | null {
+    const textContent = this.extractFunctionResponseContent(response.response);
+
+    type ToolResultContent = Anthropic.ToolResultBlockParam['content'];
+    const partBlocks: AnthropicContentBlockParam[] = [];
+
+    for (const part of response.parts || []) {
+      const block = this.createMediaBlockFromPart(part);
+      if (block) {
+        partBlocks.push(block);
+      }
+    }
+
+    let content: ToolResultContent;
+    if (partBlocks.length > 0) {
+      const blocks: AnthropicContentBlockParam[] = [];
+      if (textContent) {
+        blocks.push({ type: 'text', text: textContent });
+      }
+      blocks.push(...partBlocks);
+      content = blocks as unknown as ToolResultContent;
+    } else {
+      content = textContent;
+    }
+
    return {
-      thoughtParts,
-      contentParts,
-      functionCalls,
-      functionResponses,
+      type: 'tool_result',
+      tool_use_id: response.id || '',
+      content,
    };
  }

+  /**
+   * Converts the media payload of a Gemini part into an Anthropic content block.
+   *
+   * - `inlineData` with a supported image MIME type becomes a base64 `image`
+   *   block.
+   * - `fileData` with MIME type `application/pdf` becomes a base64 `document`
+   *   block; `fileUri` is forwarded unchanged as the block's `data`.
+   * - Any other media is replaced by a `text` block naming the unsupported
+   *   MIME type, so an invalid `media_type` is never emitted.
+   *
+   * @param part - Part whose `inlineData` / `fileData` should be converted.
+   * @returns The converted block, or `null` when the part has neither a
+   *   complete `inlineData` nor a complete `fileData` payload.
+   */
+  private createMediaBlockFromPart(
+    part: Part,
+  ): AnthropicContentBlockParam | null {
+    if (part.inlineData?.mimeType && part.inlineData?.data) {
+      if (!this.isSupportedAnthropicImageMimeType(part.inlineData.mimeType)) {
+        // Set the optional display name apart from the MIME type; appending it
+        // directly would produce text such as "audio/mpegsong.mp3".
+        const displayName = part.inlineData.displayName
+          ? ` (${part.inlineData.displayName})`
+          : '';
+        return {
+          type: 'text',
+          text: `Unsupported inline media type for Anthropic: ${part.inlineData.mimeType}${displayName}.`,
+        };
+      }
+      return {
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: part.inlineData.mimeType as
+            | 'image/jpeg'
+            | 'image/png'
+            | 'image/gif'
+            | 'image/webp',
+          data: part.inlineData.data,
+        },
+      };
+    }
+
+    if (part.fileData?.mimeType && part.fileData?.fileUri) {
+      if (part.fileData.mimeType !== 'application/pdf') {
+        // Same separation as above so the MIME type stays readable.
+        const displayName = part.fileData.displayName
+          ? ` (${part.fileData.displayName})`
+          : '';
+        return {
+          type: 'text',
+          text: `Unsupported file media for Anthropic: ${part.fileData.mimeType}${displayName}`,
+        };
+      }
+      return {
+        type: 'document',
+        source: {
+          type: 'base64',
+          media_type: part.fileData.mimeType as 'application/pdf',
+          data: part.fileData.fileUri,
+        },
+      };
+    }
+
+    return null;
+  }
+
+  /**
+   * Type guard for the image MIME types that are forwarded as Anthropic
+   * `image` blocks.
+   *
+   * @param mimeType - MIME type string to check.
+   * @returns `true` for JPEG, PNG, GIF, or WebP; `false` for anything else.
+   */
+  private isSupportedAnthropicImageMimeType(
+    mimeType: string,
+  ): mimeType is 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' {
+    switch (mimeType) {
+      case 'image/jpeg':
+      case 'image/png':
+      case 'image/gif':
+      case 'image/webp':
+        return true;
+      default:
+        return false;
+    }
+  }
+
  private extractTextFromContentUnion(contentUnion: unknown): string {
    if (typeof contentUnion === 'string') {
      return contentUnion;
--- a/packages/core/src/core/coreToolScheduler.test.ts
+++ b/packages/core/src/core/coreToolScheduler.test.ts
@@ -800,11 +800,11 @@ describe('convertToFunctionResponse', () => {
          name: toolName,
          id: callId,
          response: {
-            output: 'Binary content of type image/png was processed.',
+            output: '',
          },
+          parts: [{ inlineData: { mimeType: 'image/png', data: 'base64...' } }],
        },
      },
-      llmContent,
    ]);
  });

@@ -819,11 +819,15 @@ describe('convertToFunctionResponse', () => {
          name: toolName,
          id: callId,
          response: {
-            output: 'Binary content of type application/pdf was processed.',
+            output: '',
          },
+          parts: [
+            {
+              fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' },
+            },
+          ],
        },
      },
-      llmContent,
    ]);
  });

@@ -857,11 +861,13 @@ describe('convertToFunctionResponse', () => {
          name: toolName,
          id: callId,
          response: {
-            output: 'Binary content of type image/gif was processed.',
+            output: '',
          },
+          parts: [
+            { inlineData: { mimeType: 'image/gif', data: 'gifdata...' } },
+          ],
        },
      },
-      ...llmContent,
    ]);
  });

--- a/packages/core/src/core/coreToolScheduler.ts
+++ b/packages/core/src/core/coreToolScheduler.ts
@@ -30,7 +30,12 @@ import {
  ToolOutputTruncatedEvent,
  InputFormat,
 } from '../index.js';
-import type { Part, PartListUnion } from '@google/genai';
+import type {
+  FunctionResponse,
+  FunctionResponsePart,
+  Part,
+  PartListUnion,
+} from '@google/genai';
 import { getResponseTextFromParts } from '../utils/generateContentResponseUtilities.js';
 import type { ModifyContext } from '../tools/modifiable-tool.js';
 import {
@@ -151,13 +156,17 @@ function createFunctionResponsePart(
  callId: string,
  toolName: string,
  output: string,
+  mediaParts?: FunctionResponsePart[],
 ): Part {
+  const functionResponse: FunctionResponse = {
+    id: callId,
+    name: toolName,
+    response: { output },
+    ...(mediaParts && mediaParts.length > 0 ? { parts: mediaParts } : {}),
+  };
+
  return {
-    functionResponse: {
-      id: callId,
-      name: toolName,
-      response: { output },
-    },
+    functionResponse,
  };
 }

@@ -198,16 +207,21 @@ export function convertToFunctionResponse(
  }

  if (contentToProcess.inlineData || contentToProcess.fileData) {
-    const mimeType =
-      contentToProcess.inlineData?.mimeType ||
-      contentToProcess.fileData?.mimeType ||
-      'unknown';
+    const mediaParts: FunctionResponsePart[] = [];
+    if (contentToProcess.inlineData) {
+      mediaParts.push({ inlineData: contentToProcess.inlineData });
+    }
+    if (contentToProcess.fileData) {
+      mediaParts.push({ fileData: contentToProcess.fileData });
+    }
+
    const functionResponse = createFunctionResponsePart(
      callId,
      toolName,
-      `Binary content of type ${mimeType} was processed.`,
+      '',
+      mediaParts,
    );
-    return [functionResponse, contentToProcess];
+    return [functionResponse];
  }

  if (contentToProcess.text !== undefined) {
--- a/packages/core/src/core/nonInteractiveToolExecutor.test.ts
+++ b/packages/core/src/core/nonInteractiveToolExecutor.test.ts
@@ -309,11 +309,13 @@ describe('executeToolCall', () => {
            name: 'testTool',
            id: 'call6',
            response: {
-              output: 'Binary content of type image/png was processed.',
+              output: '',
            },
+            parts: [
+              { inlineData: { mimeType: 'image/png', data: 'base64data' } },
+            ],
          },
        },
-        imageDataPart,
      ],
    });
  });
--- a/packages/core/src/core/openaiContentGenerator/converter.test.ts
+++ b/packages/core/src/core/openaiContentGenerator/converter.test.ts
@@ -122,7 +122,13 @@ describe('OpenAIContentConverter', () => {
      const toolMessage = messages.find((message) => message.role === 'tool');

      expect(toolMessage).toBeDefined();
-      expect(toolMessage?.content).toBe('Raw output text');
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+      }>;
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('Raw output text');
    });

    it('should prioritize error field when present', () => {
@@ -134,7 +140,13 @@ describe('OpenAIContentConverter', () => {
      const toolMessage = messages.find((message) => message.role === 'tool');

      expect(toolMessage).toBeDefined();
-      expect(toolMessage?.content).toBe('Command failed');
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+      }>;
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('Command failed');
    });

    it('should stringify non-string responses', () => {
@@ -146,7 +158,318 @@ describe('OpenAIContentConverter', () => {
      const toolMessage = messages.find((message) => message.role === 'tool');

      expect(toolMessage).toBeDefined();
-      expect(toolMessage?.content).toBe('{"data":{"value":42}}');
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+      }>;
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('{"data":{"value":42}}');
+    });
+
+    it('should convert function responses with inlineData to tool message with embedded image_url', () => {
+      const request: GenerateContentParameters = {
+        model: 'models/test',
+        contents: [
+          {
+            role: 'model',
+            parts: [
+              {
+                functionCall: {
+                  id: 'call_1',
+                  name: 'Read',
+                  args: {},
+                },
+              },
+            ],
+          },
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call_1',
+                  name: 'Read',
+                  response: { output: 'Image content' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'image/png',
+                        data: 'base64encodedimagedata',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      };
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+
+      // Should have tool message with both text and image content
+      const toolMessage = messages.find((message) => message.role === 'tool');
+      expect(toolMessage).toBeDefined();
+      expect((toolMessage as { tool_call_id?: string }).tool_call_id).toBe(
+        'call_1',
+      );
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+        image_url?: { url: string };
+      }>;
+      expect(contentArray).toHaveLength(2);
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('Image content');
+      expect(contentArray[1].type).toBe('image_url');
+      expect(contentArray[1].image_url?.url).toBe(
+        'data:image/png;base64,base64encodedimagedata',
+      );
+
+      // No separate user message should be created
+      const userMessage = messages.find((message) => message.role === 'user');
+      expect(userMessage).toBeUndefined();
+    });
+
+    it('should convert function responses with fileData to tool message with embedded input_file', () => {
+      const request: GenerateContentParameters = {
+        model: 'models/test',
+        contents: [
+          {
+            role: 'model',
+            parts: [
+              {
+                functionCall: {
+                  id: 'call_1',
+                  name: 'Read',
+                  args: {},
+                },
+              },
+            ],
+          },
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call_1',
+                  name: 'Read',
+                  response: { output: 'File content' },
+                  parts: [
+                    {
+                      fileData: {
+                        mimeType: 'image/jpeg',
+                        fileUri: 'base64imagedata',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      };
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+
+      // Should have tool message with both text and file content
+      const toolMessage = messages.find((message) => message.role === 'tool');
+      expect(toolMessage).toBeDefined();
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+        file?: { filename: string; file_data: string };
+      }>;
+      expect(contentArray).toHaveLength(2);
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('File content');
+      expect(contentArray[1].type).toBe('file');
+      expect(contentArray[1].file?.filename).toBe('file'); // Default filename when displayName not provided
+      expect(contentArray[1].file?.file_data).toBe(
+        'data:image/jpeg;base64,base64imagedata',
+      );
+
+      // No separate user message should be created
+      const userMessage = messages.find((message) => message.role === 'user');
+      expect(userMessage).toBeUndefined();
+    });
+
+    it('should convert PDF fileData to tool message with embedded input_file', () => {
+      const request: GenerateContentParameters = {
+        model: 'models/test',
+        contents: [
+          {
+            role: 'model',
+            parts: [
+              {
+                functionCall: {
+                  id: 'call_1',
+                  name: 'Read',
+                  args: {},
+                },
+              },
+            ],
+          },
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call_1',
+                  name: 'Read',
+                  response: { output: 'PDF content' },
+                  parts: [
+                    {
+                      fileData: {
+                        mimeType: 'application/pdf',
+                        fileUri: 'base64pdfdata',
+                        displayName: 'document.pdf',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      };
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+
+      // Should have tool message with both text and file content
+      const toolMessage = messages.find((message) => message.role === 'tool');
+      expect(toolMessage).toBeDefined();
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+        file?: { filename: string; file_data: string };
+      }>;
+      expect(contentArray).toHaveLength(2);
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('PDF content');
+      expect(contentArray[1].type).toBe('file');
+      expect(contentArray[1].file?.filename).toBe('document.pdf');
+      expect(contentArray[1].file?.file_data).toBe(
+        'data:application/pdf;base64,base64pdfdata',
+      );
+
+      // No separate user message should be created
+      const userMessage = messages.find((message) => message.role === 'user');
+      expect(userMessage).toBeUndefined();
+    });
+
+    it('should convert audio parts to tool message with embedded input_audio', () => {
+      const request: GenerateContentParameters = {
+        model: 'models/test',
+        contents: [
+          {
+            role: 'model',
+            parts: [
+              {
+                functionCall: {
+                  id: 'call_1',
+                  name: 'Record',
+                  args: {},
+                },
+              },
+            ],
+          },
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call_1',
+                  name: 'Record',
+                  response: { output: 'Audio recorded' },
+                  parts: [
+                    {
+                      inlineData: {
+                        mimeType: 'audio/wav',
+                        data: 'audiobase64data',
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        ],
+      };
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+
+      // Should have tool message with both text and audio content
+      const toolMessage = messages.find((message) => message.role === 'tool');
+      expect(toolMessage).toBeDefined();
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+        input_audio?: { data: string; format: string };
+      }>;
+      expect(contentArray).toHaveLength(2);
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('Audio recorded');
+      expect(contentArray[1].type).toBe('input_audio');
+      expect(contentArray[1].input_audio?.data).toBe('audiobase64data');
+      expect(contentArray[1].input_audio?.format).toBe('wav');
+
+      // No separate user message should be created
+      const userMessage = messages.find((message) => message.role === 'user');
+      expect(userMessage).toBeUndefined();
+    });
+
+    it('should create tool message with text-only content when no media parts', () => {
+      const request = createRequestWithFunctionResponse({
+        output: 'Plain text output',
+      });
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+      const toolMessage = messages.find((message) => message.role === 'tool');
+
+      expect(toolMessage).toBeDefined();
+      expect(Array.isArray(toolMessage?.content)).toBe(true);
+      const contentArray = toolMessage?.content as Array<{
+        type: string;
+        text?: string;
+      }>;
+      expect(contentArray).toHaveLength(1);
+      expect(contentArray[0].type).toBe('text');
+      expect(contentArray[0].text).toBe('Plain text output');
+
+      // No user message should be created when there's no media
+      const userMessage = messages.find((message) => message.role === 'user');
+      expect(userMessage).toBeUndefined();
+    });
+
+    it('should skip empty function responses with no media and no text', () => {
+      const request: GenerateContentParameters = {
+        model: 'models/test',
+        contents: [
+          {
+            role: 'user',
+            parts: [
+              {
+                functionResponse: {
+                  id: 'call_1',
+                  name: 'Empty',
+                  response: { output: '' },
+                },
+              },
+            ],
+          },
+        ],
+      };
+
+      const messages = converter.convertGeminiRequestToOpenAI(request);
+
+      // Should have no messages for empty response
+      expect(messages).toHaveLength(0);
    });
  });

@@ -180,6 +503,35 @@ describe('OpenAIContentConverter', () => {
      );
    });

+    it('should convert reasoning to a thought part for non-streaming responses', () => {
+      const response = converter.convertOpenAIResponseToGemini({
+        object: 'chat.completion',
+        id: 'chatcmpl-2',
+        created: 123,
+        model: 'gpt-test',
+        choices: [
+          {
+            index: 0,
+            message: {
+              role: 'assistant',
+              content: 'final answer',
+              reasoning: 'chain-of-thought', // `reasoning` is supplied instead of `reasoning_content`
+            },
+            finish_reason: 'stop',
+            logprobs: null,
+          },
+        ],
+      } as unknown as OpenAI.Chat.ChatCompletion);
+
+      const parts = response.candidates?.[0]?.content?.parts;
+      expect(parts?.[0]).toEqual(
+        expect.objectContaining({ thought: true, text: 'chain-of-thought' }), // reasoning comes first, flagged as a thought
+      );
+      expect(parts?.[1]).toEqual(
+        expect.objectContaining({ text: 'final answer' }), // followed by the visible content
+      );
+    });
+
    it('should convert streaming reasoning_content delta to a thought part', () => {
      const chunk = converter.convertOpenAIChunkToGemini({
        object: 'chat.completion.chunk',
@@ -208,6 +560,34 @@ describe('OpenAIContentConverter', () => {
      );
    });

+    it('should convert streaming reasoning delta to a thought part', () => {
+      const chunk = converter.convertOpenAIChunkToGemini({
+        object: 'chat.completion.chunk',
+        id: 'chunk-1b',
+        created: 456,
+        choices: [
+          {
+            index: 0,
+            delta: {
+              content: 'visible text',
+              reasoning: 'thinking...', // `reasoning` is supplied instead of `reasoning_content`
+            },
+            finish_reason: 'stop',
+            logprobs: null,
+          },
+        ],
+        model: 'gpt-test',
+      } as unknown as OpenAI.Chat.ChatCompletionChunk);
+
+      const parts = chunk.candidates?.[0]?.content?.parts;
+      expect(parts?.[0]).toEqual(
+        expect.objectContaining({ thought: true, text: 'thinking...' }), // reasoning comes first, flagged as a thought
+      );
+      expect(parts?.[1]).toEqual(
+        expect.objectContaining({ text: 'visible text' }), // followed by the visible delta text
+      );
+    });
+
    it('should not throw when streaming chunk has no delta', () => {
      const chunk = converter.convertOpenAIChunkToGemini({
        object: 'chat.completion.chunk',
@@ -584,11 +964,7 @@ describe('OpenAIContentConverter', () => {

      expect(messages).toHaveLength(1);
      expect(messages[0].role).toBe('assistant');
-      const content = messages[0]
-        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(content).toHaveLength(2);
-      expect(content[0]).toEqual({ type: 'text', text: 'First part' });
-      expect(content[1]).toEqual({ type: 'text', text: 'Second part' });
+      expect(messages[0].content).toBe('First partSecond part');
    });

    it('should merge multiple consecutive assistant messages', () => {
@@ -614,9 +990,7 @@ describe('OpenAIContentConverter', () => {

      expect(messages).toHaveLength(1);
      expect(messages[0].role).toBe('assistant');
-      const content = messages[0]
-        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(content).toHaveLength(3);
+      expect(messages[0].content).toBe('Part 1Part 2Part 3');
    });

    it('should merge tool_calls from consecutive assistant messages', () => {
@@ -674,7 +1048,9 @@ describe('OpenAIContentConverter', () => {
        ],
      };

-      const messages = converter.convertGeminiRequestToOpenAI(request);
+      const messages = converter.convertGeminiRequestToOpenAI(request, {
+        cleanOrphanToolCalls: false,
+      });

      // Should have: assistant (tool_call_1), tool (result_1), assistant (tool_call_2), tool (result_2)
      expect(messages).toHaveLength(4);
@@ -729,10 +1105,7 @@ describe('OpenAIContentConverter', () => {
      const messages = converter.convertGeminiRequestToOpenAI(request);

      expect(messages).toHaveLength(1);
-      const content = messages[0]
-        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(Array.isArray(content)).toBe(true);
-      expect(content).toHaveLength(2);
+      expect(messages[0].content).toBe('Text partAnother text');
    });

    it('should merge empty content correctly', () => {
@@ -758,11 +1131,7 @@ describe('OpenAIContentConverter', () => {

      // Empty messages should be filtered out
      expect(messages).toHaveLength(1);
-      const content = messages[0]
-        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(content).toHaveLength(2);
-      expect(content[0]).toEqual({ type: 'text', text: 'First' });
-      expect(content[1]).toEqual({ type: 'text', text: 'Second' });
+      expect(messages[0].content).toBe('FirstSecond');
    });
  });
 });
--- a/packages/core/src/core/openaiContentGenerator/converter.ts
+++ b/packages/core/src/core/openaiContentGenerator/converter.ts
@@ -11,7 +11,6 @@ import type {
  Tool,
  ToolListUnion,
  CallableTool,
-  FunctionCall,
  FunctionResponse,
  ContentListUnion,
  ContentUnion,
@@ -47,11 +46,13 @@ type ExtendedChatCompletionMessageParam =
 export interface ExtendedCompletionMessage
  extends OpenAI.Chat.ChatCompletionMessage {
  reasoning_content?: string | null;
+  reasoning?: string | null;
 }

 export interface ExtendedCompletionChunkDelta
  extends OpenAI.Chat.ChatCompletionChunk.Choice.Delta {
  reasoning_content?: string | null;
+  reasoning?: string | null;
 }

 /**
@@ -63,21 +64,17 @@ export interface ToolCallAccumulator {
  arguments: string;
 }

-/**
- * Parsed parts from Gemini content, categorized by type
- */
-interface ParsedParts {
-  thoughtParts: string[];
-  contentParts: string[];
-  functionCalls: FunctionCall[];
-  functionResponses: FunctionResponse[];
-  mediaParts: Array<{
-    type: 'image' | 'audio' | 'file';
-    data: string;
-    mimeType: string;
-    fileUri?: string;
-  }>;
-}
+type OpenAIContentPart = // content parts this converter emits inside OpenAI messages
+  | OpenAI.Chat.ChatCompletionContentPartText // plain text
+  | OpenAI.Chat.ChatCompletionContentPartImage // image_url (URL or data URL)
+  | OpenAI.Chat.ChatCompletionContentPartInputAudio // input_audio (base64 data + format)
+  | {
+      type: 'file'; // locally declared file part carrying embedded file data
+      file: {
+        filename: string; // display name of the file; the converter falls back to 'file'
+        file_data: string; // data URL of the form data:<mimeType>;base64,<payload>
+      };
+    };

 /**
 * Converter class for transforming data between Gemini and OpenAI formats
@@ -271,28 +268,48 @@ export class OpenAIContentConverter {
  ): OpenAI.Chat.ChatCompletion {
    const candidate = response.candidates?.[0];
    const parts = (candidate?.content?.parts || []) as Part[];
-    const parsedParts = this.parseParts(parts);
+
+    // Parse parts inline
+    const thoughtParts: string[] = [];
+    const contentParts: string[] = [];
+    const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = [];
+    let toolCallIndex = 0;
+
+    for (const part of parts) {
+      if (typeof part === 'string') {
+        contentParts.push(part);
+      } else if ('text' in part && part.text) {
+        if ('thought' in part && part.thought) {
+          thoughtParts.push(part.text);
+        } else {
+          contentParts.push(part.text);
+        }
+      } else if ('functionCall' in part && part.functionCall) {
+        toolCalls.push({
+          id: part.functionCall.id || `call_${toolCallIndex}`,
+          type: 'function' as const,
+          function: {
+            name: part.functionCall.name || '',
+            arguments: JSON.stringify(part.functionCall.args || {}),
+          },
+        });
+        toolCallIndex += 1;
+      }
+    }

    const message: ExtendedCompletionMessage = {
      role: 'assistant',
-      content: parsedParts.contentParts.join('') || null,
+      content: contentParts.join('') || null,
      refusal: null,
    };

-    const reasoningContent = parsedParts.thoughtParts.join('');
+    const reasoningContent = thoughtParts.join('');
    if (reasoningContent) {
      message.reasoning_content = reasoningContent;
    }

-    if (parsedParts.functionCalls.length > 0) {
-      message.tool_calls = parsedParts.functionCalls.map((call, index) => ({
-        id: call.id || `call_${index}`,
-        type: 'function' as const,
-        function: {
-          name: call.name || '',
-          arguments: JSON.stringify(call.args || {}),
-        },
-      }));
+    if (toolCalls.length > 0) {
+      message.tool_calls = toolCalls;
    }

    const finishReason = this.mapGeminiFinishReasonToOpenAI(
@@ -390,40 +407,82 @@ export class OpenAIContentConverter {
    }

    if (!this.isContentObject(content)) return;
+    const parts = content.parts || [];
+    const role = content.role === 'model' ? 'assistant' : 'user';

-    const parsedParts = this.parseParts(content.parts || []);
+    const contentParts: OpenAIContentPart[] = [];
+    const reasoningParts: string[] = [];
+    const toolCalls: OpenAI.Chat.ChatCompletionMessageToolCall[] = [];
+    let toolCallIndex = 0;

-    // Handle function responses (tool results) first
-    if (parsedParts.functionResponses.length > 0) {
-      for (const funcResponse of parsedParts.functionResponses) {
-        messages.push({
-          role: 'tool' as const,
-          tool_call_id: funcResponse.id || '',
-          content: this.extractFunctionResponseContent(funcResponse.response),
-        });
+    for (const part of parts) {
+      if (typeof part === 'string') {
+        contentParts.push({ type: 'text' as const, text: part });
+        continue;
+      }
+
+      if ('text' in part && 'thought' in part && part.thought) {
+        if (role === 'assistant' && part.text) {
+          reasoningParts.push(part.text);
+        }
+      }
+
+      if ('text' in part && part.text && !('thought' in part && part.thought)) {
+        contentParts.push({ type: 'text' as const, text: part.text });
+      }
+
+      const mediaPart = this.createMediaContentPart(part);
+      if (mediaPart && role === 'user') {
+        contentParts.push(mediaPart);
+      }
+
+      if ('functionCall' in part && part.functionCall && role === 'assistant') {
+        toolCalls.push({
+          id: part.functionCall.id || `call_${toolCallIndex}`,
+          type: 'function' as const,
+          function: {
+            name: part.functionCall.name || '',
+            arguments: JSON.stringify(part.functionCall.args || {}),
+          },
+        });
+        toolCallIndex += 1;
+      }
+
+      if (part.functionResponse && role === 'user') {
+        // Create tool message for the function response (with embedded media)
+        const toolMessage = this.createToolMessage(part.functionResponse);
+        if (toolMessage) {
+          messages.push(toolMessage);
+        }
      }
-      return;
    }

-    // Handle model messages with function calls
-    if (content.role === 'model' && parsedParts.functionCalls.length > 0) {
-      const toolCalls = parsedParts.functionCalls.map((fc, index) => ({
-        id: fc.id || `call_${index}`,
-        type: 'function' as const,
-        function: {
-          name: fc.name || '',
-          arguments: JSON.stringify(fc.args || {}),
-        },
-      }));
+    if (role === 'assistant') {
+      if (
+        contentParts.length === 0 &&
+        toolCalls.length === 0 &&
+        reasoningParts.length === 0
+      ) {
+        return;
+      }

+      const assistantTextContent = contentParts
+        .filter(
+          (part): part is OpenAI.Chat.ChatCompletionContentPartText =>
+            part.type === 'text',
+        )
+        .map((part) => part.text)
+        .join('');
      const assistantMessage: ExtendedChatCompletionAssistantMessageParam = {
-        role: 'assistant' as const,
-        content: parsedParts.contentParts.join('') || null,
-        tool_calls: toolCalls,
+        role: 'assistant',
+        content: assistantTextContent || null,
      };

-      // Only include reasoning_content if it has actual content
-      const reasoningContent = parsedParts.thoughtParts.join('');
+      if (toolCalls.length > 0) {
+        assistantMessage.tool_calls = toolCalls;
+      }
+
+      const reasoningContent = reasoningParts.join('');
      if (reasoningContent) {
        assistantMessage.reasoning_content = reasoningContent;
      }
@@ -432,79 +491,15 @@ export class OpenAIContentConverter {
      return;
    }

-    // Handle regular messages with multimodal content
-    const role = content.role === 'model' ? 'assistant' : 'user';
-    const openAIMessage = this.createMultimodalMessage(role, parsedParts);
-
-    if (openAIMessage) {
-      messages.push(openAIMessage);
+    if (contentParts.length > 0) {
+      messages.push({
+        role: 'user',
+        content:
+          contentParts as unknown as OpenAI.Chat.ChatCompletionContentPart[],
+      });
    }
  }

-  /**
-   * Parse Gemini parts into categorized components
-   */
-  private parseParts(parts: Part[]): ParsedParts {
-    const thoughtParts: string[] = [];
-    const contentParts: string[] = [];
-    const functionCalls: FunctionCall[] = [];
-    const functionResponses: FunctionResponse[] = [];
-    const mediaParts: Array<{
-      type: 'image' | 'audio' | 'file';
-      data: string;
-      mimeType: string;
-      fileUri?: string;
-    }> = [];
-
-    for (const part of parts) {
-      if (typeof part === 'string') {
-        contentParts.push(part);
-      } else if (
-        'text' in part &&
-        part.text &&
-        !('thought' in part && part.thought)
-      ) {
-        contentParts.push(part.text);
-      } else if (
-        'text' in part &&
-        part.text &&
-        'thought' in part &&
-        part.thought
-      ) {
-        thoughtParts.push(part.text);
-      } else if ('functionCall' in part && part.functionCall) {
-        functionCalls.push(part.functionCall);
-      } else if ('functionResponse' in part && part.functionResponse) {
-        functionResponses.push(part.functionResponse);
-      } else if ('inlineData' in part && part.inlineData) {
-        const { data, mimeType } = part.inlineData;
-        if (data && mimeType) {
-          const mediaType = this.getMediaType(mimeType);
-          mediaParts.push({ type: mediaType, data, mimeType });
-        }
-      } else if ('fileData' in part && part.fileData) {
-        const { fileUri, mimeType } = part.fileData;
-        if (fileUri && mimeType) {
-          const mediaType = this.getMediaType(mimeType);
-          mediaParts.push({
-            type: mediaType,
-            data: '',
-            mimeType,
-            fileUri,
-          });
-        }
-      }
-    }
-
-    return {
-      thoughtParts,
-      contentParts,
-      functionCalls,
-      functionResponses,
-      mediaParts,
-    };
-  }
-
  private extractFunctionResponseContent(response: unknown): string {
    if (response === null || response === undefined) {
      return '';
@@ -535,6 +530,96 @@ export class OpenAIContentConverter {
    }
  }

+  /**
+   * Build the OpenAI `tool` message for a Gemini function response.
+   *
+   * The message content is an array of parts: the text extracted from
+   * `response.response` (when non-empty), followed by one part for every entry
+   * of `response.parts` that `createMediaContentPart` is able to convert.
+   *
+   * @param response - Function response to convert; its `id` becomes the
+   *   `tool_call_id` (an empty string when the id is missing).
+   * @returns The tool message, or `null` when there is neither text nor
+   *   convertible media, in which case the caller emits no message at all.
+   */
+  private createToolMessage(
+    response: FunctionResponse,
+  ): OpenAI.Chat.ChatCompletionToolMessageParam | null {
+    const textContent = this.extractFunctionResponseContent(response.response);
+    const contentParts: OpenAIContentPart[] = [];
+
+    // Text goes first so it precedes any media parts in the message
+    if (textContent) {
+      contentParts.push({ type: 'text' as const, text: textContent });
+    }
+
+    // Append every media part of the function response that can be converted
+    for (const part of response.parts || []) {
+      const mediaPart = this.createMediaContentPart(part);
+      if (mediaPart) {
+        contentParts.push(mediaPart);
+      }
+    }
+
+    // Tool messages require content, so signal "nothing to send" with null
+    if (contentParts.length === 0) {
+      return null;
+    }
+
+    // The declared tool content type only allows a string or text parts; cast because some OpenAI-compatible APIs also accept media parts here
+    return {
+      role: 'tool' as const,
+      tool_call_id: response.id || '',
+      content: contentParts as unknown as
+        | string
+        | OpenAI.Chat.ChatCompletionContentPartText[],
+    };
+  }
+
+  /**
+   * Create OpenAI media content part from Gemini part.
+   *
+   * Conversions, in order of precedence:
+   * - `inlineData` with an image MIME type -> `image_url` part holding a
+   *   base64 data URL.
+   * - `inlineData` with an audio MIME type -> `input_audio` part, provided
+   *   `getAudioFormat` recognises the MIME type.
+   * - `fileData` whose `fileUri` is already a `data:` URL -> `file` part
+   *   carrying that URL unchanged.
+   * - `fileData` whose `fileUri` is a real URI (`scheme://...`) -> `image_url`
+   *   part pointing at the URI for images; any other media type is dropped,
+   *   because the `file` part can only embed data and wrapping a URI as
+   *   base64 would produce a corrupt data URL.
+   * - `fileData` with any other `fileUri` -> `file` part; the value is treated
+   *   as a raw base64 payload and wrapped in a data URL.
+   *
+   * @param part - Gemini part to inspect.
+   * @returns The converted content part, or `null` when the part carries no
+   *   media this converter can represent.
+   */
+  private createMediaContentPart(part: Part): OpenAIContentPart | null {
+    if (part.inlineData?.mimeType && part.inlineData?.data) {
+      const mediaType = this.getMediaType(part.inlineData.mimeType);
+      if (mediaType === 'image') {
+        const dataUrl = `data:${part.inlineData.mimeType};base64,${part.inlineData.data}`;
+        return {
+          type: 'image_url' as const,
+          image_url: { url: dataUrl },
+        };
+      }
+      if (mediaType === 'audio') {
+        const format = this.getAudioFormat(part.inlineData.mimeType);
+        if (format) {
+          return {
+            type: 'input_audio' as const,
+            input_audio: {
+              data: part.inlineData.data,
+              format,
+            },
+          };
+        }
+      }
+    }
+
+    if (part.fileData?.mimeType && part.fileData?.fileUri) {
+      const filename = part.fileData.displayName || 'file';
+      const mimeType = part.fileData.mimeType;
+      const fileUri = part.fileData.fileUri;
+
+      // Already a data URL: forward it untouched
+      if (fileUri.startsWith('data:')) {
+        return {
+          type: 'file' as const,
+          file: {
+            filename,
+            file_data: fileUri,
+          },
+        };
+      }
+
+      // Base64 never contains ':', so a "scheme://" prefix marks a real URI
+      // rather than an embedded payload and must not be wrapped as base64.
+      if (/^[a-z][a-z0-9+.-]*:\/\//i.test(fileUri)) {
+        if (this.getMediaType(mimeType) === 'image') {
+          return {
+            type: 'image_url' as const,
+            image_url: { url: fileUri },
+          };
+        }
+        // A file part has no way to reference a remote location
+        return null;
+      }
+
+      // Anything else is treated as raw base64 and wrapped in a data URL
+      return {
+        type: 'file' as const,
+        file: {
+          filename,
+          file_data: `data:${mimeType};base64,${fileUri}`,
+        },
+      };
+    }
+
+    return null;
+  }
+
  /**
   * Determine media type from MIME type
   */
@@ -544,85 +629,6 @@ export class OpenAIContentConverter {
    return 'file';
  }

-  /**
-   * Create multimodal OpenAI message from parsed parts
-   */
-  private createMultimodalMessage(
-    role: 'user' | 'assistant',
-    parsedParts: Pick<
-      ParsedParts,
-      'contentParts' | 'mediaParts' | 'thoughtParts'
-    >,
-  ): ExtendedChatCompletionMessageParam | null {
-    const { contentParts, mediaParts, thoughtParts } = parsedParts;
-    const reasoningContent = thoughtParts.join('');
-    const content = contentParts.map((text) => ({
-      type: 'text' as const,
-      text,
-    }));
-
-    // If no media parts, return simple text message
-    if (mediaParts.length === 0) {
-      if (content.length === 0) return null;
-      const message: ExtendedChatCompletionMessageParam = { role, content };
-      // Only include reasoning_content if it has actual content
-      if (reasoningContent) {
-        (
-          message as ExtendedChatCompletionAssistantMessageParam
-        ).reasoning_content = reasoningContent;
-      }
-      return message;
-    }
-
-    // For assistant messages with media, convert to text only
-    // since OpenAI assistant messages don't support media content arrays
-    if (role === 'assistant') {
-      return content.length > 0
-        ? { role: 'assistant' as const, content }
-        : null;
-    }
-
-    const contentArray: OpenAI.Chat.ChatCompletionContentPart[] = [...content];
-
-    // Add media content
-    for (const mediaPart of mediaParts) {
-      if (mediaPart.type === 'image') {
-        if (mediaPart.fileUri) {
-          // For file URIs, use the URI directly
-          contentArray.push({
-            type: 'image_url' as const,
-            image_url: { url: mediaPart.fileUri },
-          });
-        } else if (mediaPart.data) {
-          // For inline data, create data URL
-          const dataUrl = `data:${mediaPart.mimeType};base64,${mediaPart.data}`;
-          contentArray.push({
-            type: 'image_url' as const,
-            image_url: { url: dataUrl },
-          });
-        }
-      } else if (mediaPart.type === 'audio' && mediaPart.data) {
-        // Convert audio format from MIME type
-        const format = this.getAudioFormat(mediaPart.mimeType);
-        if (format) {
-          contentArray.push({
-            type: 'input_audio' as const,
-            input_audio: {
-              data: mediaPart.data,
-              format: format as 'wav' | 'mp3',
-            },
-          });
-        }
-      }
-      // Note: File type is not directly supported in OpenAI's current API
-      // Could be extended in the future or handled as text description
-    }
-
-    return contentArray.length > 0
-      ? { role: 'user' as const, content: contentArray }
-      : null;
-  }
-
  /**
   * Convert MIME type to OpenAI audio format
   */
@@ -693,8 +699,9 @@ export class OpenAIContentConverter {
    const parts: Part[] = [];

    // Handle reasoning content (thoughts)
-    const reasoningText = (choice.message as ExtendedCompletionMessage)
-      .reasoning_content;
+    const reasoningText =
+      (choice.message as ExtendedCompletionMessage).reasoning_content ??
+      (choice.message as ExtendedCompletionMessage).reasoning;
    if (reasoningText) {
      parts.push({ text: reasoningText, thought: true });
    }
@@ -798,8 +805,9 @@ export class OpenAIContentConverter {
    if (choice) {
      const parts: Part[] = [];

-      const reasoningText = (choice.delta as ExtendedCompletionChunkDelta)
-        ?.reasoning_content;
+      const reasoningText =
+        (choice.delta as ExtendedCompletionChunkDelta)?.reasoning_content ??
+        (choice.delta as ExtendedCompletionChunkDelta)?.reasoning;
      if (reasoningText) {
        parts.push({ text: reasoningText, thought: true });
      }
@@ -1130,6 +1138,10 @@ export class OpenAIContentConverter {

        // If the last message is also an assistant message, merge them
        if (lastMessage.role === 'assistant') {
+          const lastToolCalls =
+            'tool_calls' in lastMessage ? lastMessage.tool_calls || [] : [];
+          const currentToolCalls =
+            'tool_calls' in message ? message.tool_calls || [] : [];
          // Combine content
          const lastContent = lastMessage.content;
          const currentContent = message.content;
@@ -1171,10 +1183,6 @@ export class OpenAIContentConverter {
          }

          // Combine tool calls
-          const lastToolCalls =
-            'tool_calls' in lastMessage ? lastMessage.tool_calls || [] : [];
-          const currentToolCalls =
-            'tool_calls' in message ? message.tool_calls || [] : [];
          const combinedToolCalls = [...lastToolCalls, ...currentToolCalls];

          // Update the last message with combined data
--- a/packages/core/src/core/openaiContentGenerator/pipeline.ts
+++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts
@@ -320,13 +320,15 @@ export class ContentGenerationPipeline {
        'frequency_penalty',
        'frequencyPenalty',
      ),
-      ...this.buildReasoningConfig(),
+      ...this.buildReasoningConfig(request),
    };

    return params;
  }

-  private buildReasoningConfig(): Record<string, unknown> {
+  private buildReasoningConfig(
+    request: GenerateContentParameters,
+  ): Record<string, unknown> {
    // Reasoning configuration for OpenAI-compatible endpoints is highly fragmented.
    // For example, across common providers and models:
    //
@@ -336,13 +338,21 @@ export class ContentGenerationPipeline {
    //   - gpt-5.x series      — thinking is enabled by default; can be disabled via `reasoning.effort`
    //   - qwen3 series        — model-dependent; can be manually disabled via `extra_body.enable_thinking`
    //
-    // Given this inconsistency, we choose not to set any reasoning config here and
-    // instead rely on each model’s default behavior.
+    // Given this inconsistency, we avoid mapping values and only pass through the
+    // configured reasoning object when explicitly enabled. This keeps provider- and
+    // model-specific semantics intact while honoring request-level opt-out.

-    // We plan to introduce provider- and model-specific settings to enable more
-    // fine-grained control over reasoning configuration.
+    if (request.config?.thinkingConfig?.includeThoughts === false) {
+      return {};
+    }

-    return {};
+    const reasoning = this.contentGeneratorConfig.reasoning;
+
+    if (reasoning === false || reasoning === undefined) {
+      return {};
+    }
+
+    return { reasoning };
  }

  /**
--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
@@ -608,7 +608,7 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      });
    });

-    it('should add empty text item with cache control if last item is not text for streaming requests', () => {
+    it('should add cache control to last item even if not text for streaming requests', () => {
      const requestWithNonTextLast: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen-max',
        stream: true, // This will trigger cache control on last message
@@ -633,12 +633,12 @@ describe('DashScopeOpenAICompatibleProvider', () => {

      const content = result.messages[0]
        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(content).toHaveLength(3);
+      expect(content).toHaveLength(2);

-      // Should add empty text item with cache control
-      expect(content[2]).toEqual({
-        type: 'text',
-        text: '',
+      // Cache control should be added to the last item (image)
+      expect(content[1]).toEqual({
+        type: 'image_url',
+        image_url: { url: 'https://example.com/image.jpg' },
        cache_control: { type: 'ephemeral' },
      });
    });
@@ -709,13 +709,8 @@ describe('DashScopeOpenAICompatibleProvider', () => {

      const content = result.messages[0]
        .content as OpenAI.Chat.ChatCompletionContentPart[];
-      expect(content).toEqual([
-        {
-          type: 'text',
-          text: '',
-          cache_control: { type: 'ephemeral' },
-        },
-      ]);
+      // Empty content array should remain empty
+      expect(content).toEqual([]);
    });
  });

--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
@@ -257,31 +257,15 @@ export class DashScopeOpenAICompatibleProvider
    contentArray: ChatCompletionContentPartWithCache[],
  ): ChatCompletionContentPartWithCache[] {
    if (contentArray.length === 0) {
-      return [
-        {
-          type: 'text',
-          text: '',
-          cache_control: { type: 'ephemeral' },
-        } as ChatCompletionContentPartTextWithCache,
-      ];
+      return contentArray;
    }

+    // Add cache_control to the last item, whatever its type (text, image, ...)
    const lastItem = contentArray[contentArray.length - 1];
-
-    if (lastItem.type === 'text') {
-      // Add cache_control to the last text item
-      contentArray[contentArray.length - 1] = {
-        ...lastItem,
-        cache_control: { type: 'ephemeral' },
-      } as ChatCompletionContentPartTextWithCache;
-    } else {
-      // If the last item is not text, add a new text item with cache_control
-      contentArray.push({
-        type: 'text',
-        text: '',
-        cache_control: { type: 'ephemeral' },
-      } as ChatCompletionContentPartTextWithCache);
-    }
+    contentArray[contentArray.length - 1] = {
+      ...lastItem,
+      cache_control: { type: 'ephemeral' },
+    } as ChatCompletionContentPartTextWithCache;

    return contentArray;
  }
--- a/packages/core/src/models/constants.ts
+++ b/packages/core/src/models/constants.ts
@@ -102,16 +102,14 @@ export const QWEN_OAUTH_ALLOWED_MODELS = [
 export const QWEN_OAUTH_MODELS: ModelConfig[] = [
  {
    id: 'coder-model',
-    name: 'Qwen Coder',
-    description:
-      'The latest Qwen Coder model from Alibaba Cloud ModelStudio (version: qwen3-coder-plus-2025-09-23)',
+    name: 'coder-model',
+    description: 'The latest Qwen Coder model from Alibaba Cloud ModelStudio',
    capabilities: { vision: false },
  },
  {
    id: 'vision-model',
-    name: 'Qwen Vision',
-    description:
-      'The latest Qwen Vision model from Alibaba Cloud ModelStudio (version: qwen3-vl-plus-2025-09-23)',
+    name: 'vision-model',
+    description: 'The latest Qwen Vision model from Alibaba Cloud ModelStudio',
    capabilities: { vision: true },
  },
 ];
--- a/packages/core/src/tools/read-file.test.ts
+++ b/packages/core/src/tools/read-file.test.ts
@@ -283,6 +283,7 @@ describe('ReadFileTool', () => {
        inlineData: {
          data: pngHeader.toString('base64'),
          mimeType: 'image/png',
+          displayName: 'image.png',
        },
      });
      expect(result.returnDisplay).toBe('Read image file: image.png');
@@ -301,9 +302,10 @@ describe('ReadFileTool', () => {

      const result = await invocation.execute(abortSignal);
      expect(result.llmContent).toEqual({
-        inlineData: {
-          data: pdfHeader.toString('base64'),
+        fileData: {
+          fileUri: pdfHeader.toString('base64'),
          mimeType: 'application/pdf',
+          displayName: 'document.pdf',
        },
      });
      expect(result.returnDisplay).toBe('Read pdf file: document.pdf');
--- a/packages/core/src/tools/read-many-files.test.ts
+++ b/packages/core/src/tools/read-many-files.test.ts
@@ -383,6 +383,7 @@ describe('ReadManyFilesTool', () => {
              0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
            ]).toString('base64'),
            mimeType: 'image/png',
+            displayName: 'image.png',
          },
        },
        '\n--- End of content ---',
@@ -407,6 +408,7 @@ describe('ReadManyFilesTool', () => {
              0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
            ]).toString('base64'),
            mimeType: 'image/png',
+            displayName: 'myExactImage.png',
          },
        },
        '\n--- End of content ---',
@@ -434,32 +436,34 @@ describe('ReadManyFilesTool', () => {
      );
    });

-    it('should include PDF files as inlineData parts if explicitly requested by extension', async () => {
+    it('should include PDF files as fileData parts if explicitly requested by extension', async () => {
      createBinaryFile('important.pdf', Buffer.from('%PDF-1.4...'));
      const params = { paths: ['*.pdf'] }; // Explicitly requesting .pdf files
      const invocation = tool.build(params);
      const result = await invocation.execute(new AbortController().signal);
      expect(result.llmContent).toEqual([
        {
-          inlineData: {
-            data: Buffer.from('%PDF-1.4...').toString('base64'),
+          fileData: {
+            fileUri: Buffer.from('%PDF-1.4...').toString('base64'),
            mimeType: 'application/pdf',
+            displayName: 'important.pdf',
          },
        },
        '\n--- End of content ---',
      ]);
    });

-    it('should include PDF files as inlineData parts if explicitly requested by name', async () => {
+    it('should include PDF files as fileData parts if explicitly requested by name', async () => {
      createBinaryFile('report-final.pdf', Buffer.from('%PDF-1.4...'));
      const params = { paths: ['report-final.pdf'] };
      const invocation = tool.build(params);
      const result = await invocation.execute(new AbortController().signal);
      expect(result.llmContent).toEqual([
        {
-          inlineData: {
-            data: Buffer.from('%PDF-1.4...').toString('base64'),
+          fileData: {
+            fileUri: Buffer.from('%PDF-1.4...').toString('base64'),
            mimeType: 'application/pdf',
+            displayName: 'report-final.pdf',
          },
        },
        '\n--- End of content ---',
--- a/packages/core/src/utils/fileUtils.test.ts
+++ b/packages/core/src/utils/fileUtils.test.ts
@@ -731,6 +731,10 @@ describe('fileUtils', () => {
      expect(
        (result.llmContent as { inlineData: { data: string } }).inlineData.data,
      ).toBe(fakePngData.toString('base64'));
+      expect(
+        (result.llmContent as { inlineData: { displayName?: string } })
+          .inlineData.displayName,
+      ).toBe('image.png');
      expect(result.returnDisplay).toContain('Read image file: image.png');
    });

@@ -743,15 +747,20 @@ describe('fileUtils', () => {
        mockConfig,
      );
      expect(
-        (result.llmContent as { inlineData: unknown }).inlineData,
+        (result.llmContent as { fileData: unknown }).fileData,
      ).toBeDefined();
      expect(
-        (result.llmContent as { inlineData: { mimeType: string } }).inlineData
+        (result.llmContent as { fileData: { mimeType: string } }).fileData
          .mimeType,
      ).toBe('application/pdf');
      expect(
-        (result.llmContent as { inlineData: { data: string } }).inlineData.data,
+        (result.llmContent as { fileData: { fileUri: string } }).fileData
+          .fileUri,
      ).toBe(fakePdfData.toString('base64'));
+      expect(
+        (result.llmContent as { fileData: { displayName?: string } }).fileData
+          .displayName,
+      ).toBe('document.pdf');
      expect(result.returnDisplay).toContain('Read pdf file: document.pdf');
    });

--- a/packages/core/src/utils/fileUtils.ts
+++ b/packages/core/src/utils/fileUtils.ts
@@ -351,6 +351,7 @@ export async function processSingleFileContent(
      .relative(rootDirectory, filePath)
      .replace(/\\/g, '/');

+    const displayName = path.basename(filePath);
    switch (fileType) {
      case 'binary': {
        return {
@@ -456,7 +457,6 @@ export async function processSingleFileContent(
        };
      }
      case 'image':
-      case 'pdf':
      case 'audio':
      case 'video': {
        const contentBuffer = await fs.promises.readFile(filePath);
@@ -466,6 +466,21 @@ export async function processSingleFileContent(
            inlineData: {
              data: base64Data,
              mimeType: mime.getType(filePath) || 'application/octet-stream',
+              displayName,
+            },
+          },
+          returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
+        };
+      }
+      case 'pdf': {
+        const contentBuffer = await fs.promises.readFile(filePath);
+        const base64Data = contentBuffer.toString('base64');
+        return {
+          llmContent: {
+            fileData: {
+              fileUri: base64Data,
+              mimeType: mime.getType(filePath) || 'application/octet-stream',
+              displayName,
            },
          },
          returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
--- a/packages/core/src/utils/pathReader.test.ts
+++ b/packages/core/src/utils/pathReader.test.ts
@@ -113,6 +113,7 @@ describe('readPathFromWorkspace', () => {
        inlineData: {
          mimeType: 'image/png',
          data: imageData.toString('base64'),
+          displayName: 'image.png',
        },
      },
    ]);
@@ -263,6 +264,7 @@ describe('readPathFromWorkspace', () => {
        inlineData: {
          mimeType: 'image/png',
          data: imageData.toString('base64'),
+          displayName: 'photo.png',
        },
      });
    });
Author	SHA1	Message	Date
tanzhenxin	b9a0d904de	feat: add multi-modal input support (image, PDF, audio) across all content generators	2026-01-21 15:44:58 +08:00
Mingholy	6eb16c0bcf	Merge pull request #1548 from QwenLM/mingholy/fix/qwen-oauth-model-info Fix: Update Qwen OAuth model information	2026-01-20 16:16:30 +08:00
mingholy.lmh	03f12bfa3f	fix: update qwen-oauth models info	2026-01-20 15:11:11 +08:00