mirror of
https://github.com/QwenLM/qwen-code.git
synced 2025-12-20 16:57:46 +00:00
feat(core): Parse Multimodal MCP Tool responses (#5529)
Co-authored-by: Luccas Paroni <luccasparoni@google.com>
This commit is contained in:
@@ -131,8 +131,11 @@ describe('DiscoveredMCPTool', () => {
|
||||
success: true,
|
||||
details: 'executed',
|
||||
};
|
||||
const mockFunctionResponseContent: Part[] = [
|
||||
{ text: JSON.stringify(mockToolSuccessResultObject) },
|
||||
const mockFunctionResponseContent = [
|
||||
{
|
||||
type: 'text',
|
||||
text: JSON.stringify(mockToolSuccessResultObject),
|
||||
},
|
||||
];
|
||||
const mockMcpToolResponseParts: Part[] = [
|
||||
{
|
||||
@@ -149,11 +152,13 @@ describe('DiscoveredMCPTool', () => {
|
||||
expect(mockCallTool).toHaveBeenCalledWith([
|
||||
{ name: serverToolName, args: params },
|
||||
]);
|
||||
expect(toolResult.llmContent).toEqual(mockMcpToolResponseParts);
|
||||
|
||||
const stringifiedResponseContent = JSON.stringify(
|
||||
mockToolSuccessResultObject,
|
||||
);
|
||||
expect(toolResult.llmContent).toEqual([
|
||||
{ text: stringifiedResponseContent },
|
||||
]);
|
||||
expect(toolResult.returnDisplay).toBe(stringifiedResponseContent);
|
||||
});
|
||||
|
||||
@@ -170,6 +175,9 @@ describe('DiscoveredMCPTool', () => {
|
||||
mockCallTool.mockResolvedValue(mockMcpToolResponsePartsEmpty);
|
||||
const toolResult: ToolResult = await tool.execute(params);
|
||||
expect(toolResult.returnDisplay).toBe('```json\n[]\n```');
|
||||
expect(toolResult.llmContent).toEqual([
|
||||
{ text: '[Error: Could not parse tool response]' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should propagate rejection if mcpTool.callTool rejects', async () => {
|
||||
@@ -186,6 +194,361 @@ describe('DiscoveredMCPTool', () => {
|
||||
|
||||
await expect(tool.execute(params)).rejects.toThrow(expectedError);
|
||||
});
|
||||
|
||||
it('should handle a simple text response correctly', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );
  const successMessage = 'This is a success message.';
  const params = { query: 'test' };

  // The GenAI SDK returns the MCP result wrapped in a functionResponse Part;
  // `response.content` carries the MCP ContentBlocks.
  const textBlock = { type: 'text', text: successMessage };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [textBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute(params);

  // The scheduler receives a plain text Part, not the SDK wrapper.
  expect(llmContent).toEqual([{ text: successMessage }]);
  // The CLI display is just the message text.
  expect(returnDisplay).toBe(successMessage);
  // The underlying tool was invoked with the server-side name and params.
  expect(mockCallTool).toHaveBeenCalledWith([
    { name: serverToolName, args: params },
  ]);
});
|
||||
|
||||
it('should handle an AudioBlock response', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // MCP audio block as delivered inside the SDK's functionResponse wrapper.
  const audioBlock = {
    type: 'audio',
    data: 'BASE64_AUDIO_DATA',
    mimeType: 'audio/mp3',
  };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [audioBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({ action: 'play' });

  // Audio becomes a descriptive text Part followed by the inline data Part.
  const expectedNotice = {
    text: `[Tool '${serverToolName}' provided the following audio data with mime-type: audio/mp3]`,
  };
  const expectedPayload = {
    inlineData: {
      mimeType: 'audio/mp3',
      data: 'BASE64_AUDIO_DATA',
    },
  };
  expect(llmContent).toEqual([expectedNotice, expectedPayload]);
  expect(returnDisplay).toBe('[Audio: audio/mp3]');
});
|
||||
|
||||
it('should handle a ResourceLinkBlock response', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // A resource link carries both `name` and `title`; the assertions below
  // expect the title to be the one that is shown.
  const linkBlock = {
    type: 'resource_link',
    uri: 'file:///path/to/thing',
    name: 'resource-name',
    title: 'My Resource',
  };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [linkBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({
    resource: 'get',
  });

  expect(llmContent).toEqual([
    { text: 'Resource Link: My Resource at file:///path/to/thing' },
  ]);
  expect(returnDisplay).toBe('[Link to My Resource: file:///path/to/thing]');
});
|
||||
|
||||
it('should handle an embedded text ResourceBlock response', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // Embedded resource whose payload is plain text.
  const embeddedText = 'This is the text content.';
  const resourceBlock = {
    type: 'resource',
    resource: {
      uri: 'file:///path/to/text.txt',
      text: embeddedText,
      mimeType: 'text/plain',
    },
  };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [resourceBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({
    resource: 'get',
  });

  // Text resources are surfaced verbatim to both the model and the display.
  expect(llmContent).toEqual([{ text: 'This is the text content.' }]);
  expect(returnDisplay).toBe('This is the text content.');
});
|
||||
|
||||
it('should handle an embedded binary ResourceBlock response', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // Embedded resource whose payload is a base64 blob rather than text.
  const resourceBlock = {
    type: 'resource',
    resource: {
      uri: 'file:///path/to/data.bin',
      blob: 'BASE64_BINARY_DATA',
      mimeType: 'application/octet-stream',
    },
  };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [resourceBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({
    resource: 'get',
  });

  // Binary resources become a descriptive text Part plus an inline data Part.
  const expectedNotice = {
    text: `[Tool '${serverToolName}' provided the following embedded resource with mime-type: application/octet-stream]`,
  };
  const expectedPayload = {
    inlineData: {
      mimeType: 'application/octet-stream',
      data: 'BASE64_BINARY_DATA',
    },
  };
  expect(llmContent).toEqual([expectedNotice, expectedPayload]);
  expect(returnDisplay).toBe('[Embedded Resource: application/octet-stream]');
});
|
||||
|
||||
it('should handle a mix of content block types', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // Text, then an image, then text again: order must be preserved.
  const mixedContent = [
    { type: 'text', text: 'First part.' },
    {
      type: 'image',
      data: 'BASE64_IMAGE_DATA',
      mimeType: 'image/jpeg',
    },
    { type: 'text', text: 'Second part.' },
  ];
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: mixedContent },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({
    action: 'complex',
  });

  // The image expands into two Parts (notice + inline data) in place.
  expect(llmContent).toEqual([
    { text: 'First part.' },
    {
      text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
    },
    {
      inlineData: {
        mimeType: 'image/jpeg',
        data: 'BASE64_IMAGE_DATA',
      },
    },
    { text: 'Second part.' },
  ]);
  expect(returnDisplay).toBe('First part.\n[Image: image/jpeg]\nSecond part.');
});
|
||||
|
||||
it('should ignore unknown content block types', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // One recognised block followed by a block type the tool does not know.
  const knownBlock = { type: 'text', text: 'Valid part.' };
  const unknownBlock = { type: 'future_block', data: 'some-data' };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: { content: [knownBlock, unknownBlock] },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({ action: 'test' });

  // The unknown block is dropped from the model content...
  expect(llmContent).toEqual([{ text: 'Valid part.' }]);
  // ...but is still mentioned in the human-readable display.
  expect(returnDisplay).toBe(
    'Valid part.\n[Unknown content type: future_block]',
  );
});
|
||||
|
||||
it('should handle a complex mix of content block types', async () => {
  const tool = new DiscoveredMCPTool(
    mockCallableToolInstance,
    serverName,
    serverToolName,
    baseDescription,
    inputSchema,
  );

  // Four different block kinds in one response: text, resource link,
  // embedded text resource and image.
  const introBlock = { type: 'text', text: 'Here is a resource.' };
  const linkBlock = {
    type: 'resource_link',
    uri: 'file:///path/to/resource',
    name: 'resource-name',
    title: 'My Resource',
  };
  const embeddedBlock = {
    type: 'resource',
    resource: {
      uri: 'file:///path/to/text.txt',
      text: 'Embedded text content.',
      mimeType: 'text/plain',
    },
  };
  const imageBlock = {
    type: 'image',
    data: 'BASE64_IMAGE_DATA',
    mimeType: 'image/jpeg',
  };
  const wrappedResponse: Part[] = [
    {
      functionResponse: {
        name: serverToolName,
        response: {
          content: [introBlock, linkBlock, embeddedBlock, imageBlock],
        },
      },
    },
  ];
  mockCallTool.mockResolvedValue(wrappedResponse);

  const { llmContent, returnDisplay } = await tool.execute({
    action: 'super-complex',
  });

  expect(llmContent).toEqual([
    { text: 'Here is a resource.' },
    { text: 'Resource Link: My Resource at file:///path/to/resource' },
    { text: 'Embedded text content.' },
    {
      text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
    },
    {
      inlineData: {
        mimeType: 'image/jpeg',
        data: 'BASE64_IMAGE_DATA',
      },
    },
  ]);
  expect(returnDisplay).toBe(
    'Here is a resource.\n[Link to My Resource: file:///path/to/resource]\nEmbedded text content.\n[Image: image/jpeg]',
  );
});
|
||||
});
|
||||
|
||||
describe('shouldConfirmExecute', () => {
|
||||
|
||||
@@ -22,6 +22,40 @@ import {
|
||||
|
||||
type ToolParams = Record<string, unknown>;
|
||||
|
||||
// Discriminated union for MCP Content Blocks to ensure type safety.
// The `type` field is the discriminant used by the transform/display helpers.

/** Plain text returned by an MCP tool. */
type McpTextBlock = {
  type: 'text';
  text: string;
};

/** Inline image or audio payload; `data` is passed through as inline data. */
type McpMediaBlock = {
  type: 'image' | 'audio';
  mimeType: string;
  data: string;
};

/**
 * Resource embedded directly in the tool response. The consumers in this
 * file read `text` first and fall back to `blob`.
 */
type McpResourceBlock = {
  type: 'resource';
  resource: {
    /** Identifier of the embedded resource; optional so existing callers keep compiling. */
    uri?: string;
    text?: string;
    blob?: string;
    mimeType?: string;
  };
};

/** Reference to a resource by URI, without embedding its contents. */
type McpResourceLinkBlock = {
  type: 'resource_link';
  uri: string;
  title?: string;
  name?: string;
};

/** Every MCP content block shape this module knows how to handle. */
type McpContentBlock =
  | McpTextBlock
  | McpMediaBlock
  | McpResourceBlock
  | McpResourceLinkBlock;
|
||||
|
||||
export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
|
||||
private static readonly allowlist: Set<string> = new Set();
|
||||
|
||||
@@ -114,70 +148,145 @@ export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
|
||||
},
|
||||
];
|
||||
|
||||
const responseParts: Part[] = await this.mcpTool.callTool(functionCalls);
|
||||
const rawResponseParts = await this.mcpTool.callTool(functionCalls);
|
||||
const transformedParts = transformMcpContentToParts(rawResponseParts);
|
||||
|
||||
return {
|
||||
llmContent: responseParts,
|
||||
returnDisplay: getStringifiedResultForDisplay(responseParts),
|
||||
llmContent: transformedParts,
|
||||
returnDisplay: getStringifiedResultForDisplay(rawResponseParts),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes an array of `Part` objects, primarily from a tool's execution result,
|
||||
* to generate a user-friendly string representation, typically for display in a CLI.
|
||||
*
|
||||
* The `result` array can contain various types of `Part` objects:
|
||||
* 1. `FunctionResponse` parts:
|
||||
* - If the `response.content` of a `FunctionResponse` is an array consisting solely
|
||||
* of `TextPart` objects, their text content is concatenated into a single string.
|
||||
* This is to present simple textual outputs directly.
|
||||
* - If `response.content` is an array but contains other types of `Part` objects (or a mix),
|
||||
* the `content` array itself is preserved. This handles structured data like JSON objects or arrays
|
||||
* returned by a tool.
|
||||
* - If `response.content` is not an array or is missing, the entire `functionResponse`
|
||||
* object is preserved.
|
||||
* 2. Other `Part` types (e.g., `TextPart` directly in the `result` array):
|
||||
* - These are preserved as is.
|
||||
*
|
||||
* All processed parts are then collected into an array, which is JSON.stringify-ed
|
||||
* with indentation and wrapped in a markdown JSON code block.
|
||||
*/
|
||||
function getStringifiedResultForDisplay(result: Part[]) {
|
||||
if (!result || result.length === 0) {
|
||||
return '```json\n[]\n```';
|
||||
/**
 * Converts an MCP text block into a GenAI text `Part`.
 *
 * @param block The MCP text content block.
 * @returns A `Part` carrying the block's text unchanged.
 */
function transformTextBlock(block: McpTextBlock): Part {
  const { text } = block;
  return { text };
}
|
||||
|
||||
/**
 * Converts an MCP image/audio block into two GenAI parts: a text notice
 * naming the tool, media kind and mime-type, followed by the inline data.
 *
 * @param block The MCP image or audio block.
 * @param toolName Name of the tool, used in the text notice.
 * @returns `[notice, inlineData]`, in that order.
 */
function transformImageAudioBlock(
  block: McpMediaBlock,
  toolName: string,
): Part[] {
  const { type: mediaKind, mimeType, data } = block;

  const notice: Part = {
    text: `[Tool '${toolName}' provided the following ${mediaKind} data with mime-type: ${mimeType}]`,
  };
  const payload: Part = { inlineData: { mimeType, data } };

  return [notice, payload];
}
|
||||
|
||||
/**
 * Converts an embedded MCP resource block into GenAI parts.
 *
 * A non-empty `text` wins and yields a single text part. Otherwise a
 * non-empty `blob` yields a text notice plus an inline data part, with the
 * mime-type defaulting to `application/octet-stream`. A resource with
 * neither produces `null`.
 *
 * @param block The MCP embedded resource block.
 * @param toolName Name of the tool, used in the text notice for blobs.
 * @returns A text part, a `[notice, inlineData]` pair, or `null`.
 */
function transformResourceBlock(
  block: McpResourceBlock,
  toolName: string,
): Part | Part[] | null {
  const text = block.resource?.text;
  if (text) {
    return { text };
  }

  const blob = block.resource?.blob;
  if (!blob) {
    return null;
  }

  const mimeType = block.resource?.mimeType || 'application/octet-stream';
  const notice: Part = {
    text: `[Tool '${toolName}' provided the following embedded resource with mime-type: ${mimeType}]`,
  };
  const payload: Part = { inlineData: { mimeType, data: blob } };
  return [notice, payload];
}
|
||||
|
||||
const processFunctionResponse = (part: Part) => {
|
||||
if (part.functionResponse) {
|
||||
const responseContent = part.functionResponse.response?.content;
|
||||
if (responseContent && Array.isArray(responseContent)) {
|
||||
// Check if all parts in responseContent are simple TextParts
|
||||
const allTextParts = responseContent.every(
|
||||
(p: Part) => p.text !== undefined,
|
||||
);
|
||||
if (allTextParts) {
|
||||
return responseContent.map((p: Part) => p.text).join('');
|
||||
}
|
||||
// If not all simple text parts, return the array of these content parts for JSON stringification
|
||||
return responseContent;
|
||||
}
|
||||
|
||||
// If no content, or not an array, or not a functionResponse, stringify the whole functionResponse part for inspection
|
||||
return part.functionResponse;
|
||||
}
|
||||
return part; // Fallback for unexpected structure or non-FunctionResponsePart
|
||||
/**
 * Converts an MCP resource link block into a GenAI text `Part` of the form
 * `Resource Link: <label> at <uri>`.
 *
 * The label is the block's `title`, falling back to `name`. Both fields are
 * optional on the block type, so a fixed placeholder is used when neither
 * is present; otherwise the text would contain the literal word "undefined".
 *
 * @param block The MCP resource link block.
 * @returns A text part describing the link.
 */
function transformResourceLinkBlock(block: McpResourceLinkBlock): Part {
  // `||` (not `??`) so that empty strings also fall through to the next label.
  const label = block.title || block.name || 'untitled resource';
  return {
    text: `Resource Link: ${label} at ${block.uri}`,
  };
}
|
||||
|
||||
const processedResults =
|
||||
result.length === 1
|
||||
? processFunctionResponse(result[0])
|
||||
: result.map(processFunctionResponse);
|
||||
if (typeof processedResults === 'string') {
|
||||
return processedResults;
|
||||
/**
 * Converts the MCP content blocks carried by an SDK tool response into a
 * flat GenAI `Part` array.
 *
 * Only the first element of `sdkResponse` is inspected: its
 * `functionResponse.response.content` is expected to be an array of MCP
 * content blocks. Blocks of an unrecognised type, and resource blocks that
 * yield nothing, are omitted from the result.
 *
 * @param sdkResponse The raw Part[] array from `mcpTool.callTool()`.
 * @returns The converted parts, or a single error text part when the
 *   response does not contain a content array.
 */
function transformMcpContentToParts(sdkResponse: Part[]): Part[] {
  const functionResponse = sdkResponse?.[0]?.functionResponse;
  const content: unknown = functionResponse?.response?.content;

  if (!Array.isArray(content)) {
    return [{ text: '[Error: Could not parse tool response]' }];
  }

  const toolName = functionResponse?.name || 'unknown tool';
  const parts: Part[] = [];

  for (const block of content as McpContentBlock[]) {
    let converted: Part | Part[] | null;
    switch (block.type) {
      case 'text':
        converted = transformTextBlock(block);
        break;
      case 'image':
      case 'audio':
        converted = transformImageAudioBlock(block, toolName);
        break;
      case 'resource':
        converted = transformResourceBlock(block, toolName);
        break;
      case 'resource_link':
        converted = transformResourceLinkBlock(block);
        break;
      default:
        // Unknown block types contribute nothing to the model content.
        converted = null;
    }

    if (converted === null) {
      continue;
    }
    if (Array.isArray(converted)) {
      parts.push(...converted);
    } else {
      parts.push(converted);
    }
  }

  return parts;
}
|
||||
|
||||
/**
 * Builds the human-readable string shown in the CLI for an MCP tool result.
 *
 * Only the first element of `rawResponse` is inspected. When its
 * `functionResponse.response.content` is an array of MCP content blocks,
 * each block becomes one line: text is shown as-is, while media, links and
 * binary resources are summarised in square brackets. Lines are joined with
 * `\n`. When no content array is present, the whole raw response is
 * pretty-printed inside a markdown JSON code fence instead.
 *
 * @param rawResponse The raw Part[] array from the GenAI SDK.
 * @returns A formatted string representing the tool's output.
 */
function getStringifiedResultForDisplay(rawResponse: Part[]): string {
  const content: unknown =
    rawResponse?.[0]?.functionResponse?.response?.content;

  if (!Array.isArray(content)) {
    return '```json\n' + JSON.stringify(rawResponse, null, 2) + '\n```';
  }

  const displayParts = (content as McpContentBlock[]).map(
    (block: McpContentBlock): string => {
      switch (block.type) {
        case 'text':
          return block.text;
        case 'image':
          return `[Image: ${block.mimeType}]`;
        case 'audio':
          return `[Audio: ${block.mimeType}]`;
        case 'resource_link': {
          // `title` and `name` are both optional; without the final fallback
          // the display would read "[Link to undefined: ...]".
          const label = block.title || block.name || 'untitled resource';
          return `[Link to ${label}: ${block.uri}]`;
        }
        case 'resource':
          if (block.resource?.text) {
            return block.resource.text;
          }
          return `[Embedded Resource: ${
            block.resource?.mimeType || 'unknown type'
          }]`;
        default:
          // Block types outside the known union are reported by name.
          return `[Unknown content type: ${(block as { type: string }).type}]`;
      }
    },
  );

  return displayParts.join('\n');
}
|
||||
|
||||
/** Visible for testing */
|
||||
|
||||
Reference in New Issue
Block a user