Vision model support for Qwen-OAuth (#525)

* refactor: openaiContentGenerator * refactor: optimize stream handling * refactor: re-organize refactored files * fix: unit test cases * feat: `/model` command for switching to vision model * fix: lint error * feat: add image tokenizer to fit vlm context window * fix: lint and type errors * feat: add `visionModelPreview` to control default visibility of vision models * fix: remove deprecated files * fix: align supported image formats with bailian doc
2025-12-20 16:57:46 +00:00 · 2025-09-18 13:32:00 +08:00
parent 56808ac210
commit 761833c915
41 changed files with 4083 additions and 5336 deletions
--- a/packages/cli/src/ui/hooks/slashCommandProcessor.test.ts
+++ b/packages/cli/src/ui/hooks/slashCommandProcessor.test.ts
@@ -106,6 +106,7 @@ describe('useSlashCommandProcessor', () => {
  const mockLoadHistory = vi.fn();
  const mockOpenThemeDialog = vi.fn();
  const mockOpenAuthDialog = vi.fn();
+  const mockOpenModelSelectionDialog = vi.fn();
  const mockSetQuittingMessages = vi.fn();

  const mockConfig = makeFakeConfig({});
@@ -122,6 +123,7 @@ describe('useSlashCommandProcessor', () => {
    mockBuiltinLoadCommands.mockResolvedValue([]);
    mockFileLoadCommands.mockResolvedValue([]);
    mockMcpLoadCommands.mockResolvedValue([]);
+    mockOpenModelSelectionDialog.mockClear();
  });

  const setupProcessorHook = (
@@ -150,11 +152,13 @@ describe('useSlashCommandProcessor', () => {
        mockSetQuittingMessages,
        vi.fn(), // openPrivacyNotice
        vi.fn(), // openSettingsDialog
+        mockOpenModelSelectionDialog,
        vi.fn(), // openSubagentCreateDialog
        vi.fn(), // openAgentsManagerDialog
        vi.fn(), // toggleVimEnabled
        setIsProcessing,
        vi.fn(), // setGeminiMdFileCount
+        vi.fn(), // _showQuitConfirmation
      ),
    );

@@ -395,6 +399,21 @@ describe('useSlashCommandProcessor', () => {
      expect(mockOpenThemeDialog).toHaveBeenCalled();
    });

+    it('should handle "dialog: model" action', async () => {
+      // Command whose action resolves to a request for the "model" dialog.
+      const requestModelDialog = vi
+        .fn()
+        .mockResolvedValue({ type: 'dialog', dialog: 'model' });
+      const hook = setupProcessorHook([
+        createTestCommand({ name: 'modelcmd', action: requestModelDialog }),
+      ]);
+
+      // Proceed only once the hook exposes the single registered command.
+      await waitFor(() => {
+        expect(hook.current.slashCommands).toHaveLength(1);
+      });
+
+      await act(async () => {
+        await hook.current.handleSlashCommand('/modelcmd');
+      });
+
+      expect(mockOpenModelSelectionDialog).toHaveBeenCalled();
+    });
+
    it('should handle "load_history" action', async () => {
      const command = createTestCommand({
        name: 'load',
@@ -904,11 +923,13 @@ describe('useSlashCommandProcessor', () => {
          mockSetQuittingMessages,
          vi.fn(), // openPrivacyNotice
          vi.fn(), // openSettingsDialog
+          vi.fn(), // openModelSelectionDialog
          vi.fn(), // openSubagentCreateDialog
          vi.fn(), // openAgentsManagerDialog
          vi.fn(), // toggleVimEnabled
          vi.fn(), // setIsProcessing
          vi.fn(), // setGeminiMdFileCount
+          vi.fn(), // _showQuitConfirmation
        ),
      );

--- a/packages/cli/src/ui/hooks/slashCommandProcessor.ts
+++ b/packages/cli/src/ui/hooks/slashCommandProcessor.ts
@@ -53,6 +53,7 @@ export const useSlashCommandProcessor = (
  setQuittingMessages: (message: HistoryItem[]) => void,
  openPrivacyNotice: () => void,
  openSettingsDialog: () => void,
+  openModelSelectionDialog: () => void,
  openSubagentCreateDialog: () => void,
  openAgentsManagerDialog: () => void,
  toggleVimEnabled: () => Promise<boolean>,
@@ -404,6 +405,9 @@ export const useSlashCommandProcessor = (
                    case 'settings':
                      openSettingsDialog();
                      return { type: 'handled' };
+                    case 'model':
+                      openModelSelectionDialog();
+                      return { type: 'handled' };
                    case 'subagent_create':
                      openSubagentCreateDialog();
                      return { type: 'handled' };
@@ -663,6 +667,7 @@ export const useSlashCommandProcessor = (
      setSessionShellAllowlist,
      setIsProcessing,
      setConfirmationRequest,
+      openModelSelectionDialog,
      session.stats,
    ],
  );
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -56,6 +56,12 @@ const MockedUserPromptEvent = vi.hoisted(() =>
 );
 const mockParseAndFormatApiError = vi.hoisted(() => vi.fn());

+// Vision auto-switch mocks (hoisted)
+const mockHandleVisionSwitch = vi.hoisted(() =>
+  vi.fn().mockResolvedValue({ shouldProceed: true }),
+);
+const mockRestoreOriginalModel = vi.hoisted(() => vi.fn());
+
 vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
  const actualCoreModule = (await importOriginal()) as any;
  return {
@@ -76,6 +82,13 @@ vi.mock('./useReactToolScheduler.js', async (importOriginal) => {
  };
 });

+vi.mock('./useVisionAutoSwitch.js', () => ({
+  useVisionAutoSwitch: vi.fn(() => ({
+    handleVisionSwitch: mockHandleVisionSwitch,
+    restoreOriginalModel: mockRestoreOriginalModel,
+  })),
+}));
+
 vi.mock('./useKeypress.js', () => ({
  useKeypress: vi.fn(),
 }));
@@ -199,6 +212,7 @@ describe('useGeminiStream', () => {
      getContentGeneratorConfig: vi
        .fn()
        .mockReturnValue(contentGeneratorConfig),
+      getMaxSessionTurns: vi.fn(() => 50),
    } as unknown as Config;
    mockOnDebugMessage = vi.fn();
    mockHandleSlashCommand = vi.fn().mockResolvedValue(false);
@@ -1551,6 +1565,7 @@ describe('useGeminiStream', () => {
      expect.any(String), // Argument 3: The prompt_id string
    );
  });
+
  describe('Thought Reset', () => {
    it('should reset thought to null when starting a new prompt', async () => {
      // First, simulate a response with a thought
@@ -1900,4 +1915,166 @@ describe('useGeminiStream', () => {
      );
    });
  });
+
+  // --- New tests focused on recent modifications ---
+  describe('Vision Auto Switch Integration', () => {
+    // Creates a new stream that emits one content event and then a STOP finish.
+    const createOkStream = () =>
+      (async function* () {
+        yield { type: ServerGeminiEventType.Content, value: 'ok' };
+        yield { type: ServerGeminiEventType.Finished, value: 'STOP' };
+      })();
+
+    // Mounts useGeminiStream with the argument list shared by both tests.
+    const mountStreamHook = () =>
+      renderHook(() =>
+        useGeminiStream(
+          new MockedGeminiClientClass(mockConfig),
+          [],
+          mockAddItem,
+          mockConfig,
+          mockOnDebugMessage,
+          mockHandleSlashCommand,
+          false,
+          () => 'vscode' as EditorType,
+          () => {},
+          () => Promise.resolve(),
+          false,
+          () => {},
+          () => {},
+          () => {},
+        ),
+      );
+
+    it('should call handleVisionSwitch and proceed to send when allowed', async () => {
+      mockHandleVisionSwitch.mockResolvedValueOnce({ shouldProceed: true });
+      mockSendMessageStream.mockReturnValue(createOkStream());
+
+      const { result: hook } = mountStreamHook();
+
+      await act(async () => {
+        await hook.current.submitQuery('image prompt');
+      });
+
+      await waitFor(() => {
+        expect(mockHandleVisionSwitch).toHaveBeenCalled();
+        expect(mockSendMessageStream).toHaveBeenCalled();
+      });
+    });
+
+    it('should gate submission when handleVisionSwitch returns shouldProceed=false', async () => {
+      mockHandleVisionSwitch.mockResolvedValueOnce({ shouldProceed: false });
+
+      const { result: hook } = mountStreamHook();
+
+      await act(async () => {
+        await hook.current.submitQuery('vision-gated');
+      });
+
+      // The gated query must not reach the API, and since no override was
+      // applied there is nothing to restore.
+      expect(mockSendMessageStream).not.toHaveBeenCalled();
+      expect(mockRestoreOriginalModel).not.toHaveBeenCalled();
+
+      // A follow-up query must go through, showing the submitting flag was
+      // reset on the gated path.
+      mockHandleVisionSwitch.mockResolvedValueOnce({ shouldProceed: true });
+      mockSendMessageStream.mockReturnValue(createOkStream());
+
+      await act(async () => {
+        await hook.current.submitQuery('after-gate');
+      });
+
+      await waitFor(() => {
+        expect(mockSendMessageStream).toHaveBeenCalled();
+      });
+    });
+  });
+
+  describe('Model restore on completion and errors', () => {
+    // Mounts useGeminiStream with the argument list shared by both tests.
+    const mountStreamHook = () =>
+      renderHook(() =>
+        useGeminiStream(
+          new MockedGeminiClientClass(mockConfig),
+          [],
+          mockAddItem,
+          mockConfig,
+          mockOnDebugMessage,
+          mockHandleSlashCommand,
+          false,
+          () => 'vscode' as EditorType,
+          () => {},
+          () => Promise.resolve(),
+          false,
+          () => {},
+          () => {},
+          () => {},
+        ),
+      );
+
+    it('should restore model after successful stream completion', async () => {
+      // Stream that delivers content and then finishes normally.
+      const completedStream = (async function* () {
+        yield { type: ServerGeminiEventType.Content, value: 'content' };
+        yield { type: ServerGeminiEventType.Finished, value: 'STOP' };
+      })();
+      mockSendMessageStream.mockReturnValue(completedStream);
+
+      const { result: hook } = mountStreamHook();
+
+      await act(async () => {
+        await hook.current.submitQuery('restore-success');
+      });
+
+      await waitFor(() => {
+        expect(mockRestoreOriginalModel).toHaveBeenCalledTimes(1);
+      });
+    });
+
+    it('should restore model when an error occurs during streaming', async () => {
+      const testError = new Error('stream failure');
+      // Stream that delivers content and then fails mid-iteration.
+      const failingStream = (async function* () {
+        yield { type: ServerGeminiEventType.Content, value: 'content' };
+        throw testError;
+      })();
+      mockSendMessageStream.mockReturnValue(failingStream);
+
+      const { result: hook } = mountStreamHook();
+
+      await act(async () => {
+        await hook.current.submitQuery('restore-error');
+      });
+
+      await waitFor(() => {
+        expect(mockRestoreOriginalModel).toHaveBeenCalledTimes(1);
+      });
+    });
+  });
 });
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -42,6 +42,7 @@ import type {
 import { StreamingState, MessageType, ToolCallStatus } from '../types.js';
 import { isAtCommand, isSlashCommand } from '../utils/commandUtils.js';
 import { useShellCommandProcessor } from './shellCommandProcessor.js';
+import { useVisionAutoSwitch } from './useVisionAutoSwitch.js';
 import { handleAtCommand } from './atCommandProcessor.js';
 import { findLastSafeSplitPoint } from '../utils/markdownUtilities.js';
 import { useStateAndRef } from './useStateAndRef.js';
@@ -88,6 +89,12 @@ export const useGeminiStream = (
  setModelSwitchedFromQuotaError: React.Dispatch<React.SetStateAction<boolean>>,
  onEditorClose: () => void,
  onCancelSubmit: () => void,
+  visionModelPreviewEnabled: boolean = false,
+  onVisionSwitchRequired?: (query: PartListUnion) => Promise<{
+    modelOverride?: string;
+    persistSessionModel?: string;
+    showGuidance?: boolean;
+  }>,
 ) => {
  const [initError, setInitError] = useState<string | null>(null);
  const abortControllerRef = useRef<AbortController | null>(null);
@@ -155,6 +162,13 @@ export const useGeminiStream = (
    geminiClient,
  );

+  const { handleVisionSwitch, restoreOriginalModel } = useVisionAutoSwitch(
+    config,
+    addItem,
+    visionModelPreviewEnabled,
+    onVisionSwitchRequired,
+  );
+
  const streamingState = useMemo(() => {
    if (toolCalls.some((tc) => tc.status === 'awaiting_approval')) {
      return StreamingState.WaitingForConfirmation;
@@ -715,6 +729,20 @@ export const useGeminiStream = (
        return;
      }

+      // Handle vision switch requirement
+      const visionSwitchResult = await handleVisionSwitch(
+        queryToSend,
+        userMessageTimestamp,
+        options?.isContinuation || false,
+      );
+
+      if (!visionSwitchResult.shouldProceed) {
+        isSubmittingQueryRef.current = false;
+        return;
+      }
+
+      const finalQueryToSend = queryToSend;
+
      if (!options?.isContinuation) {
        startNewPrompt();
        setThought(null); // Reset thought when starting a new prompt
@@ -725,7 +753,7 @@ export const useGeminiStream = (

      try {
        const stream = geminiClient.sendMessageStream(
-          queryToSend,
+          finalQueryToSend,
          abortSignal,
          prompt_id!,
        );
@@ -736,6 +764,8 @@ export const useGeminiStream = (
        );

        if (processingStatus === StreamProcessingStatus.UserCancelled) {
+          // Restore original model if it was temporarily overridden
+          restoreOriginalModel();
          isSubmittingQueryRef.current = false;
          return;
        }
@@ -748,7 +778,13 @@ export const useGeminiStream = (
          loopDetectedRef.current = false;
          handleLoopDetectedEvent();
        }
+
+        // Restore original model if it was temporarily overridden
+        restoreOriginalModel();
      } catch (error: unknown) {
+        // Restore original model if it was temporarily overridden
+        restoreOriginalModel();
+
        if (error instanceof UnauthorizedError) {
          onAuthError();
        } else if (!isNodeError(error) || error.name !== 'AbortError') {
@@ -786,6 +822,8 @@ export const useGeminiStream = (
      startNewPrompt,
      getPromptCount,
      handleLoopDetectedEvent,
+      handleVisionSwitch,
+      restoreOriginalModel,
    ],
  );

--- a/packages/cli/src/ui/hooks/useVisionAutoSwitch.test.ts
+++ b/packages/cli/src/ui/hooks/useVisionAutoSwitch.test.ts
@@ -0,0 +1,374 @@
+/**
+ * @license
+ * Copyright 2025 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* eslint-disable @typescript-eslint/no-explicit-any */
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { renderHook, act } from '@testing-library/react';
+import type { Part, PartListUnion } from '@google/genai';
+import { AuthType, type Config } from '@qwen-code/qwen-code-core';
+import {
+  shouldOfferVisionSwitch,
+  processVisionSwitchOutcome,
+  getVisionSwitchGuidanceMessage,
+  useVisionAutoSwitch,
+} from './useVisionAutoSwitch.js';
+import { VisionSwitchOutcome } from '../components/ModelSwitchDialog.js';
+import { MessageType } from '../types.js';
+import { getDefaultVisionModel } from '../models/availableModels.js';
+
+describe('useVisionAutoSwitch helpers', () => {
+  describe('shouldOfferVisionSwitch', () => {
+    // Fixtures shared by the table below.
+    const pngPart: Part = {
+      inlineData: { mimeType: 'image/png', data: '...' },
+    };
+    const textModel = 'qwen3-coder-plus';
+
+    interface OfferCase {
+      title: string;
+      parts: PartListUnion;
+      authType: AuthType;
+      model: string;
+      previewEnabled: boolean;
+      expected: boolean;
+    }
+
+    // One row per scenario; each row becomes its own test case.
+    const offerCases: OfferCase[] = [
+      {
+        title: 'returns false when authType is not QWEN_OAUTH',
+        parts: [pngPart],
+        authType: AuthType.USE_GEMINI,
+        model: textModel,
+        previewEnabled: true,
+        expected: false,
+      },
+      {
+        title: 'returns false when current model is already a vision model',
+        parts: [pngPart],
+        authType: AuthType.QWEN_OAUTH,
+        model: 'qwen-vl-max-latest',
+        previewEnabled: true,
+        expected: false,
+      },
+      {
+        title:
+          'returns true when image parts exist, QWEN_OAUTH, and model is not vision',
+        parts: [
+          { text: 'hello' },
+          { inlineData: { mimeType: 'image/jpeg', data: '...' } },
+        ],
+        authType: AuthType.QWEN_OAUTH,
+        model: textModel,
+        previewEnabled: true,
+        expected: true,
+      },
+      {
+        title:
+          'detects image when provided as a single Part object (non-array)',
+        parts: {
+          fileData: { mimeType: 'image/gif', fileUri: 'file://image.gif' },
+        } as Part,
+        authType: AuthType.QWEN_OAUTH,
+        model: textModel,
+        previewEnabled: true,
+        expected: true,
+      },
+      {
+        title: 'returns false when parts contain no images',
+        parts: [{ text: 'just text' }],
+        authType: AuthType.QWEN_OAUTH,
+        model: textModel,
+        previewEnabled: true,
+        expected: false,
+      },
+      {
+        title: 'returns false when parts is a plain string',
+        parts: 'plain text',
+        authType: AuthType.QWEN_OAUTH,
+        model: textModel,
+        previewEnabled: true,
+        expected: false,
+      },
+      {
+        title: 'returns false when visionModelPreviewEnabled is false',
+        parts: [pngPart],
+        authType: AuthType.QWEN_OAUTH,
+        model: textModel,
+        previewEnabled: false,
+        expected: false,
+      },
+    ];
+
+    for (const testCase of offerCases) {
+      it(testCase.title, () => {
+        const offered = shouldOfferVisionSwitch(
+          testCase.parts,
+          testCase.authType,
+          testCase.model,
+          testCase.previewEnabled,
+        );
+        expect(offered).toBe(testCase.expected);
+      });
+    }
+  });
+
+  describe('processVisionSwitchOutcome', () => {
+    it('maps SwitchOnce to a one-time model override', () => {
+      const expectedModel = getDefaultVisionModel();
+      expect(
+        processVisionSwitchOutcome(VisionSwitchOutcome.SwitchOnce),
+      ).toEqual({ modelOverride: expectedModel });
+    });
+
+    it('maps SwitchSessionToVL to a persistent session model', () => {
+      const expectedModel = getDefaultVisionModel();
+      expect(
+        processVisionSwitchOutcome(VisionSwitchOutcome.SwitchSessionToVL),
+      ).toEqual({ persistSessionModel: expectedModel });
+    });
+
+    it('maps DisallowWithGuidance to showGuidance', () => {
+      expect(
+        processVisionSwitchOutcome(VisionSwitchOutcome.DisallowWithGuidance),
+      ).toEqual({ showGuidance: true });
+    });
+  });
+
+  describe('getVisionSwitchGuidanceMessage', () => {
+    it('returns the expected guidance message', () => {
+      const visionModel = getDefaultVisionModel();
+      const expectedLines = [
+        'To use images with your query, you can:',
+        `• Use /model set ${visionModel} to switch to a vision-capable model`,
+        '• Or remove the image and provide a text description instead',
+      ];
+      expect(getVisionSwitchGuidanceMessage()).toBe(expectedLines.join('\n'));
+    });
+  });
+});
+
+describe('useVisionAutoSwitch hook', () => {
+  type AddItemFn = (
+    item: { type: MessageType; text: string },
+    ts: number,
+  ) => any;
+  type SwitchHandler = Parameters<typeof useVisionAutoSwitch>[3];
+  type SwitchHook = { current: ReturnType<typeof useVisionAutoSwitch> };
+
+  // Builds a Config stub whose getModel/setModel share one mutable model id.
+  const createMockConfig = (authType: AuthType, initialModel: string) => {
+    let currentModel = initialModel;
+    const mockConfig: Partial<Config> = {
+      getModel: vi.fn(() => currentModel),
+      setModel: vi.fn((m: string) => {
+        currentModel = m;
+      }),
+      getContentGeneratorConfig: vi.fn(() => ({
+        authType,
+        model: currentModel,
+        apiKey: 'test-key',
+        vertexai: false,
+      })),
+    };
+    return mockConfig as Config;
+  };
+
+  // Query containing a single inline PNG image.
+  const imageParts = (): PartListUnion => [
+    { inlineData: { mimeType: 'image/png', data: '...' } },
+  ];
+
+  let addItem: AddItemFn;
+
+  // Renders the hook with the shared addItem mock.
+  const renderSwitchHook = (
+    config: Config,
+    onVisionSwitchRequired: SwitchHandler,
+    visionModelPreviewEnabled = true,
+  ) =>
+    renderHook(() =>
+      useVisionAutoSwitch(
+        config,
+        addItem as any,
+        visionModelPreviewEnabled,
+        onVisionSwitchRequired,
+      ),
+    );
+
+  // Invokes handleVisionSwitch inside act() and returns what it resolved to.
+  const runSwitch = async (
+    hook: SwitchHook,
+    parts: PartListUnion,
+    timestamp: number,
+    isContinuation: boolean,
+  ): Promise<unknown> => {
+    let outcome: unknown;
+    await act(async () => {
+      outcome = await hook.current.handleVisionSwitch(
+        parts,
+        timestamp,
+        isContinuation,
+      );
+    });
+    return outcome;
+  };
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    addItem = vi.fn();
+  });
+
+  it('returns shouldProceed=true immediately for continuations', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const { result } = renderSwitchHook(config, vi.fn());
+
+    const res = await runSwitch(result, imageParts(), Date.now(), true);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(addItem).not.toHaveBeenCalled();
+  });
+
+  it('does nothing when authType is not QWEN_OAUTH', async () => {
+    const config = createMockConfig(AuthType.USE_GEMINI, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi.fn();
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const res = await runSwitch(result, imageParts(), 123, false);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(onVisionSwitchRequired).not.toHaveBeenCalled();
+  });
+
+  it('does nothing when there are no image parts', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi.fn();
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const textOnly: PartListUnion = [{ text: 'no images here' }];
+    const res = await runSwitch(result, textOnly, 456, false);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(onVisionSwitchRequired).not.toHaveBeenCalled();
+  });
+
+  it('shows guidance and blocks when dialog returns showGuidance', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi
+      .fn()
+      .mockResolvedValue({ showGuidance: true });
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const userTs = 1010;
+    const res = await runSwitch(result, imageParts(), userTs, false);
+
+    expect(addItem).toHaveBeenCalledWith(
+      { type: MessageType.INFO, text: getVisionSwitchGuidanceMessage() },
+      userTs,
+    );
+    expect(res).toEqual({ shouldProceed: false });
+    expect(config.setModel).not.toHaveBeenCalled();
+  });
+
+  it('applies a one-time override and returns originalModel, then restores', async () => {
+    const initialModel = 'qwen3-coder-plus';
+    const config = createMockConfig(AuthType.QWEN_OAUTH, initialModel);
+    const onVisionSwitchRequired = vi
+      .fn()
+      .mockResolvedValue({ modelOverride: 'qwen-vl-max-latest' });
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const res = await runSwitch(result, imageParts(), 2020, false);
+
+    expect(res).toEqual({ shouldProceed: true, originalModel: initialModel });
+    expect(config.setModel).toHaveBeenCalledWith('qwen-vl-max-latest');
+
+    // Now restore
+    act(() => {
+      result.current.restoreOriginalModel();
+    });
+    expect(config.setModel).toHaveBeenLastCalledWith(initialModel);
+  });
+
+  it('persists session model when dialog requests persistence', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi
+      .fn()
+      .mockResolvedValue({ persistSessionModel: 'qwen-vl-max-latest' });
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const res = await runSwitch(result, imageParts(), 3030, false);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(config.setModel).toHaveBeenCalledWith('qwen-vl-max-latest');
+
+    // Restore should be a no-op since no one-time override was used
+    act(() => {
+      result.current.restoreOriginalModel();
+    });
+    // setModel must have run exactly once (the persisted switch). Counting
+    // calls catches a spurious restore even if it reused the same model id,
+    // and avoids mutating the mock's call log with pop().
+    expect(config.setModel).toHaveBeenCalledTimes(1);
+    expect(config.setModel).toHaveBeenLastCalledWith('qwen-vl-max-latest');
+  });
+
+  it('returns shouldProceed=true when dialog returns no special flags', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi.fn().mockResolvedValue({});
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const res = await runSwitch(result, imageParts(), 4040, false);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(config.setModel).not.toHaveBeenCalled();
+  });
+
+  it('blocks when dialog throws or is cancelled', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi.fn().mockRejectedValue(new Error('x'));
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired);
+
+    const res = await runSwitch(result, imageParts(), 5050, false);
+
+    expect(res).toEqual({ shouldProceed: false });
+    expect(config.setModel).not.toHaveBeenCalled();
+  });
+
+  it('does nothing when visionModelPreviewEnabled is false', async () => {
+    const config = createMockConfig(AuthType.QWEN_OAUTH, 'qwen3-coder-plus');
+    const onVisionSwitchRequired = vi.fn();
+    const { result } = renderSwitchHook(config, onVisionSwitchRequired, false);
+
+    const res = await runSwitch(result, imageParts(), 6060, false);
+
+    expect(res).toEqual({ shouldProceed: true });
+    expect(onVisionSwitchRequired).not.toHaveBeenCalled();
+  });
+});
--- a/packages/cli/src/ui/hooks/useVisionAutoSwitch.ts
+++ b/packages/cli/src/ui/hooks/useVisionAutoSwitch.ts
@@ -0,0 +1,304 @@
+/**
+ * @license
+ * Copyright 2025 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { type PartListUnion, type Part } from '@google/genai';
+import { AuthType, type Config } from '@qwen-code/qwen-code-core';
+import { useCallback, useRef } from 'react';
+import { VisionSwitchOutcome } from '../components/ModelSwitchDialog.js';
+import {
+  getDefaultVisionModel,
+  isVisionModel,
+} from '../models/availableModels.js';
+import { MessageType } from '../types.js';
+import type { UseHistoryManagerReturn } from './useHistoryManager.js';
+import {
+  isSupportedImageMimeType,
+  getUnsupportedImageFormatWarning,
+} from '@qwen-code/qwen-code-core';
+
+/**
+ * Reports whether a query carries at least one image part.
+ *
+ * A bare string never counts. Arrays are scanned element by element, ignoring
+ * string elements. A single Part object is inspected directly.
+ */
+function hasImageParts(parts: PartListUnion): boolean {
+  if (typeof parts === 'string') {
+    return false;
+  }
+
+  if (!Array.isArray(parts)) {
+    // Single (non-string) Part.
+    return typeof parts === 'object' ? isImagePart(parts) : false;
+  }
+
+  for (const entry of parts) {
+    if (typeof entry !== 'string' && isImagePart(entry)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Reports whether a Part is an image, i.e. its inlineData or fileData declares
+ * a MIME type beginning with "image/".
+ */
+function isImagePart(part: Part): boolean {
+  const inlineMimeType =
+    'inlineData' in part ? part.inlineData?.mimeType : undefined;
+  const fileMimeType = 'fileData' in part ? part.fileData?.mimeType : undefined;
+
+  // inlineData is consulted first; fileData only matters if that is no image.
+  return [inlineMimeType, fileMimeType].some(
+    (mimeType) => mimeType?.startsWith('image/') === true,
+  );
+}
+
+/**
+ * Scans a query for image parts and collects the image MIME types that
+ * `isSupportedImageMimeType` rejects.
+ *
+ * A part counts as an image when its inlineData or fileData MIME type starts
+ * with "image/". Both fields are validated independently, so an unsupported
+ * type in one is reported even when the other holds a supported type.
+ *
+ * @param parts - The query: a string, a single Part, or an array of them.
+ * @returns `hasImages` (any image part found), `hasUnsupportedFormats`
+ *   (at least one rejected type) and `unsupportedMimeTypes` (the rejected
+ *   types in encounter order, duplicates included).
+ */
+function checkImageFormatsSupport(parts: PartListUnion): {
+  hasImages: boolean;
+  hasUnsupportedFormats: boolean;
+  unsupportedMimeTypes: string[];
+} {
+  const unsupportedMimeTypes: string[] = [];
+  let hasImages = false;
+
+  // A bare string query cannot carry image data.
+  if (typeof parts !== 'string') {
+    const partsArray = Array.isArray(parts) ? parts : [parts];
+
+    for (const part of partsArray) {
+      if (typeof part === 'string') continue;
+
+      // Collect both possible image sources instead of letting fileData
+      // overwrite inlineData.
+      const candidateMimeTypes = [
+        'inlineData' in part ? part.inlineData?.mimeType : undefined,
+        'fileData' in part ? part.fileData?.mimeType : undefined,
+      ];
+
+      for (const mimeType of candidateMimeTypes) {
+        if (mimeType == null || !mimeType.startsWith('image/')) continue;
+
+        hasImages = true;
+        if (!isSupportedImageMimeType(mimeType)) {
+          unsupportedMimeTypes.push(mimeType);
+        }
+      }
+    }
+  }
+
+  return {
+    hasImages,
+    hasUnsupportedFormats: unsupportedMimeTypes.length > 0,
+    unsupportedMimeTypes,
+  };
+}
+
+/**
+ * Decides whether the user should be offered a switch to a vision model.
+ *
+ * The offer is made only when all of the following hold, checked in order:
+ * the auth type is qwen-oauth, the vision model preview is enabled, the
+ * current model is not already a vision model, and the query contains an
+ * image part.
+ */
+export function shouldOfferVisionSwitch(
+  parts: PartListUnion,
+  authType: AuthType,
+  currentModel: string,
+  visionModelPreviewEnabled: boolean = false,
+): boolean {
+  const switchIsApplicable =
+    authType === AuthType.QWEN_OAUTH &&
+    visionModelPreviewEnabled &&
+    !isVisionModel(currentModel);
+
+  // Only inspect the message content once the cheaper preconditions pass.
+  return switchIsApplicable && hasImageParts(parts);
+}
+
+/**
+ * Interface for vision switch result.
+ *
+ * Describes the decision taken for a query that contains images. The hook in
+ * this file checks `showGuidance` first, then `modelOverride`, then
+ * `persistSessionModel`; when none is set the query proceeds unchanged.
+ */
+export interface VisionSwitchResult {
+  /** Model to use as a one-time override; the previous model is remembered so it can be restored. */
+  modelOverride?: string;
+  /** Model to switch to persistently; no previous model is remembered for restoring. */
+  persistSessionModel?: string;
+  /** When true, a guidance message is shown and the query is not sent. */
+  showGuidance?: boolean;
+}
+
+/**
+ * Translates the outcome chosen in the model switch dialog into a
+ * VisionSwitchResult.
+ *
+ * SwitchOnce yields a one-time override and SwitchSessionToVL a persistent
+ * session model, both using the default vision model. Every other outcome,
+ * including DisallowWithGuidance, yields `showGuidance: true`.
+ */
+export function processVisionSwitchOutcome(
+  outcome: VisionSwitchOutcome,
+): VisionSwitchResult {
+  const visionModel = getDefaultVisionModel();
+
+  if (outcome === VisionSwitchOutcome.SwitchOnce) {
+    return { modelOverride: visionModel };
+  }
+
+  if (outcome === VisionSwitchOutcome.SwitchSessionToVL) {
+    return { persistSessionModel: visionModel };
+  }
+
+  // DisallowWithGuidance and any unrecognised outcome.
+  return { showGuidance: true };
+}
+
+/**
+ * Builds the three-line guidance text shown when the vision switch is
+ * disallowed; it names the default vision model in the suggested command.
+ */
+export function getVisionSwitchGuidanceMessage(): string {
+  const visionModel = getDefaultVisionModel();
+  const lines = [
+    'To use images with your query, you can:',
+    `• Use /model set ${visionModel} to switch to a vision-capable model`,
+    '• Or remove the image and provide a text description instead',
+  ];
+  return lines.join('\n');
+}
+
+/**
+ * Interface for vision switch handling result.
+ *
+ * Returned by `handleVisionSwitch` to tell the caller whether the query
+ * should still be sent.
+ */
+export interface VisionSwitchHandlingResult {
+  /** False when the query must not be sent (guidance was shown, or the switch handler rejected). */
+  shouldProceed: boolean;
+  /** Model that was active before a one-time override; set only on that path. */
+  originalModel?: string;
+}
+
+/**
+ * Custom hook for handling vision model auto-switching.
+ *
+ * @param config - Supplies the current model and the content generator's auth
+ *   type; its `setModel` is used to apply and undo model changes.
+ * @param addItem - History callback used to show INFO messages (the
+ *   unsupported-format warning and the guidance text).
+ * @param visionModelPreviewEnabled - When false the switch is never offered.
+ * @param onVisionSwitchRequired - Asked how to handle a query that needs a
+ *   vision model. When omitted, every query proceeds unchanged.
+ * @returns `handleVisionSwitch`, to call before sending a query, and
+ *   `restoreOriginalModel`, which undoes a pending one-time override.
+ */
+export function useVisionAutoSwitch(
+  config: Config,
+  addItem: UseHistoryManagerReturn['addItem'],
+  visionModelPreviewEnabled: boolean = false,
+  onVisionSwitchRequired?: (
+    query: PartListUnion,
+  ) => Promise<VisionSwitchResult>,
+) {
+  // Model to switch back to after a one-time override; null when none pending.
+  const originalModelRef = useRef<string | null>(null);
+
+  const handleVisionSwitch = useCallback(
+    async (
+      query: PartListUnion,
+      userMessageTimestamp: number,
+      isContinuation: boolean,
+    ): Promise<VisionSwitchHandlingResult> => {
+      // Skip vision switch handling for continuations or if no handler provided
+      if (isContinuation || !onVisionSwitchRequired) {
+        return { shouldProceed: true };
+      }
+
+      const contentGeneratorConfig = config.getContentGeneratorConfig();
+
+      // Only handle qwen-oauth auth type
+      if (contentGeneratorConfig?.authType !== AuthType.QWEN_OAUTH) {
+        return { shouldProceed: true };
+      }
+
+      // Check image format support first
+      const formatCheck = checkImageFormatsSupport(query);
+
+      // If there are unsupported image formats, show warning
+      if (formatCheck.hasUnsupportedFormats) {
+        addItem(
+          {
+            type: MessageType.INFO,
+            text: getUnsupportedImageFormatWarning(),
+          },
+          userMessageTimestamp,
+        );
+        // Continue processing but with warning shown
+      }
+
+      // Check if vision switch is needed
+      if (
+        !shouldOfferVisionSwitch(
+          query,
+          contentGeneratorConfig.authType,
+          config.getModel(),
+          visionModelPreviewEnabled,
+        )
+      ) {
+        return { shouldProceed: true };
+      }
+
+      try {
+        const visionSwitchResult = await onVisionSwitchRequired(query);
+
+        if (visionSwitchResult.showGuidance) {
+          // Show guidance and don't proceed with the request
+          addItem(
+            {
+              type: MessageType.INFO,
+              text: getVisionSwitchGuidanceMessage(),
+            },
+            userMessageTimestamp,
+          );
+          return { shouldProceed: false };
+        }
+
+        if (visionSwitchResult.modelOverride) {
+          // One-time model override. If an earlier override is still pending,
+          // keep its saved model so the restore target is not replaced by the
+          // temporary model. The ref is written only after setModel succeeds,
+          // so a failure leaves no stale restore target behind.
+          const previousModel = originalModelRef.current ?? config.getModel();
+          config.setModel(visionSwitchResult.modelOverride);
+          originalModelRef.current = previousModel;
+          return {
+            shouldProceed: true,
+            originalModel: previousModel,
+          };
+        } else if (visionSwitchResult.persistSessionModel) {
+          // Persistent session model change. It supersedes any pending
+          // one-time override, so a later restore must not undo it.
+          config.setModel(visionSwitchResult.persistSessionModel);
+          originalModelRef.current = null;
+          return { shouldProceed: true };
+        }
+
+        return { shouldProceed: true };
+      } catch (_error) {
+        // If vision switch dialog was cancelled or errored, don't proceed
+        return { shouldProceed: false };
+      }
+    },
+    [config, addItem, visionModelPreviewEnabled, onVisionSwitchRequired],
+  );
+
+  // Undoes a pending one-time override; does nothing when none is pending.
+  const restoreOriginalModel = useCallback(() => {
+    if (originalModelRef.current) {
+      config.setModel(originalModelRef.current);
+      originalModelRef.current = null;
+    }
+  }, [config]);
+
+  return {
+    handleVisionSwitch,
+    restoreOriginalModel,
+  };
+}