feat(core): Parse Multimodal MCP Tool responses (#5529)

Co-authored-by: Luccas Paroni <luccasparoni@google.com>
2025-12-20 16:57:46 +00:00 · 2025-08-05 16:19:47 -03:00
parent b465145229
commit 2778c7d851
4 changed files with 580 additions and 56 deletions
--- a/packages/core/src/tools/mcp-tool.test.ts
+++ b/packages/core/src/tools/mcp-tool.test.ts
@@ -131,8 +131,11 @@ describe('DiscoveredMCPTool', () => {
        success: true,
        details: 'executed',
      };
-      const mockFunctionResponseContent: Part[] = [
-        { text: JSON.stringify(mockToolSuccessResultObject) },
+      const mockFunctionResponseContent = [
+        {
+          type: 'text',
+          text: JSON.stringify(mockToolSuccessResultObject),
+        },
      ];
      const mockMcpToolResponseParts: Part[] = [
        {
@@ -149,11 +152,13 @@ describe('DiscoveredMCPTool', () => {
      expect(mockCallTool).toHaveBeenCalledWith([
        { name: serverToolName, args: params },
      ]);
-      expect(toolResult.llmContent).toEqual(mockMcpToolResponseParts);

      const stringifiedResponseContent = JSON.stringify(
        mockToolSuccessResultObject,
      );
+      expect(toolResult.llmContent).toEqual([
+        { text: stringifiedResponseContent },
+      ]);
      expect(toolResult.returnDisplay).toBe(stringifiedResponseContent);
    });

@@ -170,6 +175,9 @@ describe('DiscoveredMCPTool', () => {
      mockCallTool.mockResolvedValue(mockMcpToolResponsePartsEmpty);
      const toolResult: ToolResult = await tool.execute(params);
      expect(toolResult.returnDisplay).toBe('```json\n[]\n```');
+      expect(toolResult.llmContent).toEqual([
+        { text: '[Error: Could not parse tool response]' },
+      ]);
    });

    it('should propagate rejection if mcpTool.callTool rejects', async () => {
@@ -186,6 +194,361 @@ describe('DiscoveredMCPTool', () => {

      await expect(tool.execute(params)).rejects.toThrow(expectedError);
    });
+
+    it('should handle a simple text response correctly', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { query: 'test' };
+      const successMessage = 'This is a success message.';
+
+      // Simulate the response from the GenAI SDK, which wraps the MCP
+      // response in a functionResponse Part.
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              // The `content` array contains MCP ContentBlocks.
+              content: [{ type: 'text', text: successMessage }],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      // 1. Assert that the llmContent sent to the scheduler is a clean Part array.
+      expect(toolResult.llmContent).toEqual([{ text: successMessage }]);
+
+      // 2. Assert that the display output is the simple text message.
+      expect(toolResult.returnDisplay).toBe(successMessage);
+
+      // 3. Verify that the underlying callTool was made correctly.
+      expect(mockCallTool).toHaveBeenCalledWith([
+        { name: serverToolName, args: params },
+      ]);
+    });
+
+    it('should handle an AudioBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'play' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'audio',
+                  data: 'BASE64_AUDIO_DATA',
+                  mimeType: 'audio/mp3',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: `[Tool '${serverToolName}' provided the following audio data with mime-type: audio/mp3]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'audio/mp3',
+            data: 'BASE64_AUDIO_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe('[Audio: audio/mp3]');
+    });
+
+    it('should handle a ResourceLinkBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource_link',
+                  uri: 'file:///path/to/thing',
+                  name: 'resource-name',
+                  title: 'My Resource',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: 'Resource Link: My Resource at file:///path/to/thing',
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        '[Link to My Resource: file:///path/to/thing]',
+      );
+    });
+
+    it('should handle an embedded text ResourceBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/text.txt',
+                    text: 'This is the text content.',
+                    mimeType: 'text/plain',
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'This is the text content.' },
+      ]);
+      expect(toolResult.returnDisplay).toBe('This is the text content.');
+    });
+
+    it('should handle an embedded binary ResourceBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/data.bin',
+                    blob: 'BASE64_BINARY_DATA',
+                    mimeType: 'application/octet-stream',
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: `[Tool '${serverToolName}' provided the following embedded resource with mime-type: application/octet-stream]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'application/octet-stream',
+            data: 'BASE64_BINARY_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        '[Embedded Resource: application/octet-stream]',
+      );
+    });
+
+    it('should handle a mix of content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'complex' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'First part.' },
+                {
+                  type: 'image',
+                  data: 'BASE64_IMAGE_DATA',
+                  mimeType: 'image/jpeg',
+                },
+                { type: 'text', text: 'Second part.' },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'First part.' },
+        {
+          text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'image/jpeg',
+            data: 'BASE64_IMAGE_DATA',
+          },
+        },
+        { text: 'Second part.' },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        'First part.\n[Image: image/jpeg]\nSecond part.',
+      );
+    });
+
+    it('should ignore unknown content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'test' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'Valid part.' },
+                { type: 'future_block', data: 'some-data' },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([{ text: 'Valid part.' }]);
+      expect(toolResult.returnDisplay).toBe(
+        'Valid part.\n[Unknown content type: future_block]',
+      );
+    });
+
+    it('should handle a complex mix of content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'super-complex' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'Here is a resource.' },
+                {
+                  type: 'resource_link',
+                  uri: 'file:///path/to/resource',
+                  name: 'resource-name',
+                  title: 'My Resource',
+                },
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/text.txt',
+                    text: 'Embedded text content.',
+                    mimeType: 'text/plain',
+                  },
+                },
+                {
+                  type: 'image',
+                  data: 'BASE64_IMAGE_DATA',
+                  mimeType: 'image/jpeg',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'Here is a resource.' },
+        {
+          text: 'Resource Link: My Resource at file:///path/to/resource',
+        },
+        { text: 'Embedded text content.' },
+        {
+          text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'image/jpeg',
+            data: 'BASE64_IMAGE_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        'Here is a resource.\n[Link to My Resource: file:///path/to/resource]\nEmbedded text content.\n[Image: image/jpeg]',
+      );
+    });
  });

  describe('shouldConfirmExecute', () => {
--- a/packages/core/src/tools/mcp-tool.ts
+++ b/packages/core/src/tools/mcp-tool.ts
@@ -22,6 +22,40 @@ import {

 type ToolParams = Record<string, unknown>;

+// Discriminated union for MCP Content Blocks to ensure type safety.
+type McpTextBlock = {
+  type: 'text';
+  text: string;
+};
+
+type McpMediaBlock = {
+  type: 'image' | 'audio';
+  mimeType: string;
+  data: string;
+};
+
+type McpResourceBlock = {
+  type: 'resource';
+  resource: {
+    text?: string;
+    blob?: string;
+    mimeType?: string;
+  };
+};
+
+type McpResourceLinkBlock = {
+  type: 'resource_link';
+  uri: string;
+  title?: string;
+  name?: string;
+};
+
+type McpContentBlock =
+  | McpTextBlock
+  | McpMediaBlock
+  | McpResourceBlock
+  | McpResourceLinkBlock;
+
 export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
  private static readonly allowlist: Set<string> = new Set();

@@ -114,70 +148,145 @@ export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
      },
    ];

-    const responseParts: Part[] = await this.mcpTool.callTool(functionCalls);
+    const rawResponseParts = await this.mcpTool.callTool(functionCalls);
+    const transformedParts = transformMcpContentToParts(rawResponseParts);

    return {
-      llmContent: responseParts,
-      returnDisplay: getStringifiedResultForDisplay(responseParts),
+      llmContent: transformedParts,
+      returnDisplay: getStringifiedResultForDisplay(rawResponseParts),
    };
  }
 }

-/**
- * Processes an array of `Part` objects, primarily from a tool's execution result,
- * to generate a user-friendly string representation, typically for display in a CLI.
- *
- * The `result` array can contain various types of `Part` objects:
- * 1. `FunctionResponse` parts:
- *    - If the `response.content` of a `FunctionResponse` is an array consisting solely
- *      of `TextPart` objects, their text content is concatenated into a single string.
- *      This is to present simple textual outputs directly.
- *    - If `response.content` is an array but contains other types of `Part` objects (or a mix),
- *      the `content` array itself is preserved. This handles structured data like JSON objects or arrays
- *      returned by a tool.
- *    - If `response.content` is not an array or is missing, the entire `functionResponse`
- *      object is preserved.
- * 2. Other `Part` types (e.g., `TextPart` directly in the `result` array):
- *    - These are preserved as is.
- *
- * All processed parts are then collected into an array, which is JSON.stringify-ed
- * with indentation and wrapped in a markdown JSON code block.
- */
-function getStringifiedResultForDisplay(result: Part[]) {
-  if (!result || result.length === 0) {
-    return '```json\n[]\n```';
+function transformTextBlock(block: McpTextBlock): Part {
+  return { text: block.text };
+}
+
+function transformImageAudioBlock(
+  block: McpMediaBlock,
+  toolName: string,
+): Part[] {
+  return [
+    {
+      text: `[Tool '${toolName}' provided the following ${
+        block.type
+      } data with mime-type: ${block.mimeType}]`,
+    },
+    {
+      inlineData: {
+        mimeType: block.mimeType,
+        data: block.data,
+      },
+    },
+  ];
+}
+
+function transformResourceBlock(
+  block: McpResourceBlock,
+  toolName: string,
+): Part | Part[] | null {
+  const resource = block.resource;
+  if (resource?.text) {
+    return { text: resource.text };
  }
+  if (resource?.blob) {
+    const mimeType = resource.mimeType || 'application/octet-stream';
+    return [
+      {
+        text: `[Tool '${toolName}' provided the following embedded resource with mime-type: ${mimeType}]`,
+      },
+      {
+        inlineData: {
+          mimeType,
+          data: resource.blob,
+        },
+      },
+    ];
+  }
+  return null;
+}

-  const processFunctionResponse = (part: Part) => {
-    if (part.functionResponse) {
-      const responseContent = part.functionResponse.response?.content;
-      if (responseContent && Array.isArray(responseContent)) {
-        // Check if all parts in responseContent are simple TextParts
-        const allTextParts = responseContent.every(
-          (p: Part) => p.text !== undefined,
-        );
-        if (allTextParts) {
-          return responseContent.map((p: Part) => p.text).join('');
-        }
-        // If not all simple text parts, return the array of these content parts for JSON stringification
-        return responseContent;
-      }
-
-      // If no content, or not an array, or not a functionResponse, stringify the whole functionResponse part for inspection
-      return part.functionResponse;
-    }
-    return part; // Fallback for unexpected structure or non-FunctionResponsePart
+function transformResourceLinkBlock(block: McpResourceLinkBlock): Part {
+  return {
+    text: `Resource Link: ${block.title || block.name} at ${block.uri}`,
  };
+}

-  const processedResults =
-    result.length === 1
-      ? processFunctionResponse(result[0])
-      : result.map(processFunctionResponse);
-  if (typeof processedResults === 'string') {
-    return processedResults;
+/**
+ * Transforms the raw MCP content blocks from the SDK response into a
+ * standard GenAI Part array.
+ * @param sdkResponse The raw Part[] array from `mcpTool.callTool()`.
+ * @returns A clean Part[] array ready for the scheduler.
+ */
+function transformMcpContentToParts(sdkResponse: Part[]): Part[] {
+  const funcResponse = sdkResponse?.[0]?.functionResponse;
+  const mcpContent = funcResponse?.response?.content as McpContentBlock[];
+  const toolName = funcResponse?.name || 'unknown tool';
+
+  if (!Array.isArray(mcpContent)) {
+    return [{ text: '[Error: Could not parse tool response]' }];
  }

-  return '```json\n' + JSON.stringify(processedResults, null, 2) + '\n```';
+  const transformed = mcpContent.flatMap(
+    (block: McpContentBlock): Part | Part[] | null => {
+      switch (block.type) {
+        case 'text':
+          return transformTextBlock(block);
+        case 'image':
+        case 'audio':
+          return transformImageAudioBlock(block, toolName);
+        case 'resource':
+          return transformResourceBlock(block, toolName);
+        case 'resource_link':
+          return transformResourceLinkBlock(block);
+        default:
+          return null;
+      }
+    },
+  );
+
+  return transformed.filter((part): part is Part => part !== null);
+}
+
+/**
+ * Processes the raw response from the MCP tool to generate a clean,
+ * human-readable string for display in the CLI. It summarizes non-text
+ * content and presents text directly.
+ *
+ * @param rawResponse The raw Part[] array from the GenAI SDK.
+ * @returns A formatted string representing the tool's output.
+ */
+function getStringifiedResultForDisplay(rawResponse: Part[]): string {
+  const mcpContent = rawResponse?.[0]?.functionResponse?.response
+    ?.content as McpContentBlock[];
+
+  if (!Array.isArray(mcpContent)) {
+    return '```json\n' + JSON.stringify(rawResponse, null, 2) + '\n```';
+  }
+
+  const displayParts = mcpContent.map((block: McpContentBlock): string => {
+    switch (block.type) {
+      case 'text':
+        return block.text;
+      case 'image':
+        return `[Image: ${block.mimeType}]`;
+      case 'audio':
+        return `[Audio: ${block.mimeType}]`;
+      case 'resource_link':
+        return `[Link to ${block.title || block.name}: ${block.uri}]`;
+      case 'resource':
+        if (block.resource?.text) {
+          return block.resource.text;
+        }
+        return `[Embedded Resource: ${
+          block.resource?.mimeType || 'unknown type'
+        }]`;
+      default:
+        return `[Unknown content type: ${(block as { type: string }).type}]`;
+    }
+  });
+
+  return displayParts.join('\n');
 }

 /** Visible for testing */