Remove auto-execution on Flash in the event of a 429/Quota failover (#3662)

Co-authored-by: Jenna Inouye <jinouye@google.com>
2025-12-20 08:47:44 +00:00 · 2025-07-09 13:55:56 -04:00
parent 01e756481f
commit 8a6509ffeb
14 changed files with 292 additions and 86 deletions
--- a/packages/cli/src/ui/App.tsx
+++ b/packages/cli/src/ui/App.tsx
@@ -70,6 +70,7 @@ import { UpdateNotification } from './components/UpdateNotification.js';
 import {
  isProQuotaExceededError,
  isGenericQuotaExceededError,
+  UserTierId,
 } from '@google/gemini-cli-core';
 import { checkForUpdates } from './utils/updateCheck.js';
 import ansiEscapes from 'ansi-escapes';
@@ -136,6 +137,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
  const ctrlDTimerRef = useRef<NodeJS.Timeout | null>(null);
  const [constrainHeight, setConstrainHeight] = useState<boolean>(true);
  const [showPrivacyNotice, setShowPrivacyNotice] = useState<boolean>(false);
+  const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
+    useState<boolean>(false);

  const openPrivacyNotice = useCallback(() => {
    setShowPrivacyNotice(true);
@@ -251,23 +254,51 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
    ): Promise<boolean> => {
      let message: string;

+      // User tier is not available here yet, so treat everyone as FREE tier (safe default): isPaidTier is always false and only the free-tier upgrade messaging below is shown
+      // TODO: Get actual user tier from config when available
+      const userTier = undefined; // Defaults to FREE tier behavior
+      const isPaidTier =
+        userTier === UserTierId.LEGACY || userTier === UserTierId.STANDARD;
+
      // Check if this is a Pro quota exceeded error
      if (error && isProQuotaExceededError(error)) {
-        message = `⚡ You have reached your daily ${currentModel} quota limit.
+        if (isPaidTier) {
+          message = `⚡ You have reached your daily ${currentModel} quota limit.
+⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          message = `⚡ You have reached your daily ${currentModel} quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
+        }
      } else if (error && isGenericQuotaExceededError(error)) {
-        message = `⚡ You have reached your daily quota limit.
+        if (isPaidTier) {
+          message = `⚡ You have reached your daily quota limit.
+⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          message = `⚡ You have reached your daily quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
+        }
      } else {
-        // Default fallback message for other cases (like consecutive 429s)
-        message = `⚡ Slow response times detected.
-⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.`;
+        if (isPaidTier) {
+          // Default fallback message for other cases (like consecutive 429s)
+          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.
+⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          // Default fallback message for other cases (like consecutive 429s)
+          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.  
+⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
+⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
+⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
+⚡ You can switch authentication methods by typing /auth`;
+        }
      }

      // Add message to UI history
@@ -278,7 +309,14 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
        },
        Date.now(),
      );
-      return true; // Always accept the fallback
+
+      // Set the flag to prevent tool continuation
+      setModelSwitchedFromQuotaError(true);
+      // Set global quota error flag to prevent Flash model calls
+      config.setQuotaErrorOccurred(true);
+      // Switch model for future use but return false to stop current retry
+      config.setModel(fallbackModel);
+      return false; // Don't continue with current prompt
    };

    config.setFlashFallbackHandler(flashFallbackHandler);
@@ -445,6 +483,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
    getPreferredEditor,
    onAuthError,
    performMemoryRefresh,
+    modelSwitchedFromQuotaError,
+    setModelSwitchedFromQuotaError,
  );
  pendingHistoryItems.push(...pendingGeminiHistoryItems);
  const { elapsedTime, currentLoadingPhrase } =
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -301,6 +301,8 @@ describe('useGeminiStream', () => {
      getUsageStatisticsEnabled: () => true,
      getDebugMode: () => false,
      addHistory: vi.fn(),
+      setQuotaErrorOccurred: vi.fn(),
+      getQuotaErrorOccurred: vi.fn(() => false),
    } as unknown as Config;
    mockOnDebugMessage = vi.fn();
    mockHandleSlashCommand = vi.fn().mockResolvedValue(false);
@@ -386,6 +388,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          () => Promise.resolve(),
+          false,
+          () => {},
        );
      },
      {
@@ -518,6 +522,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
+        false,
+        () => {},
      ),
    );

@@ -582,6 +588,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
+        false,
+        () => {},
      ),
    );

@@ -675,6 +683,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
+        false,
+        () => {},
      ),
    );

@@ -775,6 +785,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
+        false,
+        () => {},
      ),
    );

@@ -1063,6 +1075,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          mockPerformMemoryRefresh,
+          false,
+          () => {},
        ),
      );

@@ -1113,6 +1127,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          () => Promise.resolve(),
+          false,
+          () => {},
        ),
      );

--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -90,6 +90,8 @@ export const useGeminiStream = (
  getPreferredEditor: () => EditorType | undefined,
  onAuthError: () => void,
  performMemoryRefresh: () => Promise<void>,
+  modelSwitchedFromQuotaError: boolean,
+  setModelSwitchedFromQuotaError: React.Dispatch<React.SetStateAction<boolean>>,
 ) => {
  const [initError, setInitError] = useState<string | null>(null);
  const abortControllerRef = useRef<AbortController | null>(null);
@@ -494,6 +496,12 @@ export const useGeminiStream = (
      const userMessageTimestamp = Date.now();
      setShowHelp(false);

+      // Reset both quota-error flags (UI state and config) when starting a new query (not a continuation)
+      if (!options?.isContinuation) {
+        setModelSwitchedFromQuotaError(false);
+        config.setQuotaErrorOccurred(false);
+      }
+
      abortControllerRef.current = new AbortController();
      const abortSignal = abortControllerRef.current.signal;
      turnCancelledRef.current = false;
@@ -552,6 +560,7 @@ export const useGeminiStream = (
    [
      streamingState,
      setShowHelp,
+      setModelSwitchedFromQuotaError,
      prepareQueryForGemini,
      processGeminiStreamEvents,
      pendingHistoryItemRef,
@@ -668,6 +677,12 @@ export const useGeminiStream = (
      );

      markToolsAsSubmitted(callIdsToMarkAsSubmitted);
+
+      // Don't continue if model was switched due to quota error
+      if (modelSwitchedFromQuotaError) {
+        return;
+      }
+
      submitQuery(mergePartListUnions(responsesToSend), {
        isContinuation: true,
      });
@@ -678,6 +693,7 @@ export const useGeminiStream = (
      markToolsAsSubmitted,
      geminiClient,
      performMemoryRefresh,
+      modelSwitchedFromQuotaError,
    ],
  );

--- a/packages/cli/src/ui/utils/errorParsing.test.ts
+++ b/packages/cli/src/ui/utils/errorParsing.test.ts
@@ -39,7 +39,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
  });

@@ -55,7 +55,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
  });

@@ -169,7 +169,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
    expect(result).not.toContain(
      'You have reached your daily gemini-2.5-pro quota limit',
@@ -262,21 +262,17 @@ describe('parseAndFormatApiError', () => {
    );
  });

-  it('should handle different Gemini version strings in Pro quota exceeded errors', () => {
-    const errorMessage15 =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 1.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
+  it('should handle different Gemini 2.5 version strings in Pro quota exceeded errors', () => {
+    const errorMessage25 =
+      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
    const errorMessagePreview =
      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5-preview Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
-    const errorMessageBeta =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini beta-3.0 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
-    const errorMessageExperimental =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini experimental-v2 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';

-    const result15 = parseAndFormatApiError(
-      errorMessage15,
+    const result25 = parseAndFormatApiError(
+      errorMessage25,
      AuthType.LOGIN_WITH_GOOGLE,
      undefined,
-      'gemini-1.5-pro',
+      'gemini-2.5-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
    const resultPreview = parseAndFormatApiError(
@@ -286,45 +282,19 @@ describe('parseAndFormatApiError', () => {
      'gemini-2.5-preview-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
-    const resultBeta = parseAndFormatApiError(
-      errorMessageBeta,
-      AuthType.LOGIN_WITH_GOOGLE,
-      undefined,
-      'gemini-beta-3.0-pro',
-      DEFAULT_GEMINI_FLASH_MODEL,
-    );
-    const resultExperimental = parseAndFormatApiError(
-      errorMessageExperimental,
-      AuthType.LOGIN_WITH_GOOGLE,
-      undefined,
-      'gemini-experimental-v2-pro',
-      DEFAULT_GEMINI_FLASH_MODEL,
-    );

-    expect(result15).toContain(
-      'You have reached your daily gemini-1.5-pro quota limit',
+    expect(result25).toContain(
+      'You have reached your daily gemini-2.5-pro quota limit',
    );
    expect(resultPreview).toContain(
      'You have reached your daily gemini-2.5-preview-pro quota limit',
    );
-    expect(resultBeta).toContain(
-      'You have reached your daily gemini-beta-3.0-pro quota limit',
-    );
-    expect(resultExperimental).toContain(
-      'You have reached your daily gemini-experimental-v2-pro quota limit',
-    );
-    expect(result15).toContain(
+    expect(result25).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
    expect(resultPreview).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
-    expect(resultBeta).toContain(
-      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
-    );
-    expect(resultExperimental).toContain(
-      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
-    );
  });

  it('should not match non-Pro models with similar version strings', () => {
@@ -339,16 +309,6 @@ describe('parseAndFormatApiError', () => {
        "Quota exceeded for quota metric 'Gemini 2.5-preview Flash Requests' and limit",
      ),
    ).toBe(false);
-    expect(
-      isProQuotaExceededError(
-        "Quota exceeded for quota metric 'Gemini beta-3.0 Flash Requests' and limit",
-      ),
-    ).toBe(false);
-    expect(
-      isProQuotaExceededError(
-        "Quota exceeded for quota metric 'Gemini experimental-v2 Flash Requests' and limit",
-      ),
-    ).toBe(false);

    // Test other model types
    expect(
--- a/packages/cli/src/ui/utils/errorParsing.ts
+++ b/packages/cli/src/ui/utils/errorParsing.ts
@@ -19,7 +19,7 @@ import {
 const getRateLimitErrorMessageGoogleFree = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;

 const getRateLimitErrorMessageGoogleProQuotaFree = (
  currentModel: string = DEFAULT_GEMINI_MODEL,
@@ -34,7 +34,7 @@ const getRateLimitErrorMessageGoogleGenericQuotaFree = () =>
 const getRateLimitErrorMessageGooglePaid = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;

 const getRateLimitErrorMessageGoogleProQuotaPaid = (
  currentModel: string = DEFAULT_GEMINI_MODEL,
@@ -53,7 +53,7 @@ const RATE_LIMIT_ERROR_MESSAGE_VERTEX =
 const getRateLimitErrorMessageDefault = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;

 function getRateLimitMessage(
  authType?: AuthType,