
Commit 86eba9d

Chesars authored and krrishdholakia committed
feat: Add support for reasoning_effort="none" for Gemini models (BerriAI#16548)
Implements support for the reasoning_effort="none" parameter for Gemini models, providing significant cost savings (up to 96% cheaper) by disabling the thinking budget while maintaining response quality.

Changes:
- Added "supports_reasoning": true to gemini-2.0-flash-thinking-exp-01-21 in the model config
- Implemented mapping of reasoning_effort="none" to thinkingConfig {"thinkingBudget": 0, "includeThoughts": false}
- Added a unit test to verify the mapping works correctly

Performance impact:
- Without reasoning_effort: ~313 tokens
- With reasoning_effort="none": ~12 tokens (96% cheaper)

Closes BerriAI#16420

Co-authored-by: Krish Dholakia <[email protected]>
1 parent a8938cd commit 86eba9d
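
As context for the token numbers above, here is a minimal, hedged sketch of how the savings can be checked from the caller's side. It assumes a valid GEMINI_API_KEY and that the model name below is available to your key; exact token counts will vary by prompt and model version.

```python
# Hedged sketch: compare completion token usage with and without
# reasoning_effort="none". Counts are illustrative, not guaranteed.
from litellm import completion

prompt = [{"role": "user", "content": "What is the capital of France?"}]
model = "gemini/gemini-2.0-flash-thinking-exp-01-21"

baseline = completion(model=model, messages=prompt)
no_thinking = completion(model=model, messages=prompt, reasoning_effort="none")

# litellm responses expose an OpenAI-style usage object
print("default completion tokens:      ", baseline.usage.completion_tokens)
print("reasoning_effort='none' tokens: ", no_thinking.usage.completion_tokens)
```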

4 files changed: +46 additions, -7 deletions


docs/my-website/docs/providers/gemini.md
Lines changed: 20 additions & 7 deletions

@@ -64,23 +64,36 @@ response = completion(
 
 LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
 
-Added an additional non-OpenAI standard "disable" value for non-reasoning Gemini requests.
+**Cost Optimization:** Use `reasoning_effort="none"` (OpenAI standard) for significant cost savings - up to 96% cheaper. [Google's docs](https://ai.google.dev/gemini-api/docs/openai)
+
+:::info
+Note: Reasoning cannot be turned off on Gemini 2.5 Pro models.
+:::
 
 **Mapping**
 
-| reasoning_effort | thinking |
-| ---------------- | -------- |
-| "disable" | "budget_tokens": 0 |
-| "low" | "budget_tokens": 1024 |
-| "medium" | "budget_tokens": 2048 |
-| "high" | "budget_tokens": 4096 |
+| reasoning_effort | thinking | Notes |
+| ---------------- | -------- | ----- |
+| "none" | "budget_tokens": 0, "includeThoughts": false | 💰 **Recommended for cost optimization** - OpenAI-compatible, always 0 |
+| "disable" | "budget_tokens": DEFAULT (0), "includeThoughts": false | LiteLLM-specific, configurable via env var |
+| "low" | "budget_tokens": 1024 | |
+| "medium" | "budget_tokens": 2048 | |
+| "high" | "budget_tokens": 4096 | |
 
 <Tabs>
 <TabItem value="sdk" label="SDK">
 
 ```python
 from litellm import completion
 
+# Cost-optimized: Use reasoning_effort="none" for best pricing
+resp = completion(
+    model="gemini/gemini-2.0-flash-thinking-exp-01-21",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    reasoning_effort="none",  # Up to 96% cheaper!
+)
+
+# Or use other levels: "low", "medium", "high"
 resp = completion(
     model="gemini/gemini-2.5-flash-preview-04-17",
     messages=[{"role": "user", "content": "What is the capital of France?"}],
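
For readers unfamiliar with Gemini's thinking controls, the mapping documented above roughly corresponds to the request-level config shown in the sketch below. Field names follow Google's generateContent REST schema; this is an illustrative sketch, not litellm's exact payload construction.

```python
# Illustrative sketch of the generateContent request body that results from
# reasoning_effort="none"; litellm builds this internally, so you never send
# it by hand. Field names follow Google's REST schema.
request_body = {
    "contents": [
        {"role": "user", "parts": [{"text": "What is the capital of France?"}]}
    ],
    "generationConfig": {
        # "none" -> thinking disabled entirely
        "thinkingConfig": {"thinkingBudget": 0, "includeThoughts": False},
    },
}
```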

litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
Lines changed: 5 additions & 0 deletions

@@ -567,6 +567,11 @@ def _map_reasoning_effort_to_thinking_budget(
                 "thinkingBudget": DEFAULT_REASONING_EFFORT_DISABLE_THINKING_BUDGET,
                 "includeThoughts": False,
             }
+        elif reasoning_effort == "none":
+            return {
+                "thinkingBudget": 0,
+                "includeThoughts": False,
+            }
         else:
             raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
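
A quick way to exercise the new branch locally is sketched below; it mirrors the unit test added in tests/llm_translation/test_gemini.py and assumes the private helper keeps the signature shown in this diff.

```python
# Hedged sketch: print the thinkingConfig produced for each reasoning_effort
# value. _map_reasoning_effort_to_thinking_budget is a private helper, so its
# signature may change between litellm versions.
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
    VertexGeminiConfig,
)

for effort in ("none", "disable", "low", "medium", "high"):
    cfg = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(
        reasoning_effort=effort,
        model="gemini-2.0-flash-thinking-exp-01-21",
    )
    print(f"{effort!r} -> {cfg}")
```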

model_prices_and_context_window.json
Lines changed: 2 additions & 0 deletions

@@ -9963,6 +9963,7 @@
         "supports_function_calling": false,
         "supports_parallel_function_calling": true,
         "supports_prompt_caching": true,
+        "supports_reasoning": true,
         "supports_response_schema": false,
         "supports_system_messages": true,
         "supports_tool_choice": true,
@@ -11568,6 +11569,7 @@
         "supports_audio_output": true,
         "supports_function_calling": true,
         "supports_prompt_caching": true,
+        "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
         "supports_tool_choice": true,

tests/llm_translation/test_gemini.py
Lines changed: 19 additions & 0 deletions

@@ -1137,6 +1137,25 @@ def test_gemini_embedding():
     assert response is not None
 
 
+def test_reasoning_effort_none_mapping():
+    """
+    Test that reasoning_effort='none' correctly maps to thinkingConfig.
+    Related issue: https://github.com/BerriAI/litellm/issues/16420
+    """
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexGeminiConfig,
+    )
+
+    # Test reasoning_effort="none" mapping
+    result = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(
+        reasoning_effort="none",
+        model="gemini-2.0-flash-thinking-exp-01-21",
+    )
+
+    assert result is not None
+    assert result["thinkingBudget"] == 0
+    assert result["includeThoughts"] is False
+
 def test_gemini_function_args_preserve_unicode():
     """
     Test for Issue #16533: Gemini function call arguments should preserve non-ASCII characters
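
As a follow-up check on the `supports_reasoning` flag added to model_prices_and_context_window.json above, here is a hedged sketch of how a caller might confirm the flag is visible. It assumes `litellm.get_model_info` passes the key through; if your version does not, inspect the JSON entry directly.

```python
# Hedged sketch: read the model's capability flags via litellm's model-info
# helper; the supports_reasoning key is assumed to be surfaced from
# model_prices_and_context_window.json.
import litellm

info = litellm.get_model_info(model="gemini/gemini-2.0-flash-thinking-exp-01-21")
print(info.get("supports_reasoning"))  # expected: True after this change
```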
