
Commit a7db0df

Gemini-2.5-flash improvements (#10198)
* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0. Fixes #10121
* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive candidate tokens. Addresses #10141 (comment)
1 parent d1fb051 commit a7db0df
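
The thinking-budget fix comes down to Python truthiness: the old guard `if thinking_budget:` treated an explicit budget of 0 as falsy and dropped it. A minimal sketch of the difference, using plain dicts rather than litellm's own GeminiThinkingConfig type:

```python
# Illustrative sketch only -- plain dicts, not litellm's actual types.
def map_thinking_budget_old(thinking_budget):
    params = {"includeThoughts": True}
    if thinking_budget:  # 0 is falsy, so an explicit budget of 0 was silently dropped
        params["thinkingBudget"] = thinking_budget
    return params

def map_thinking_budget_new(thinking_budget):
    params = {"includeThoughts": True}
    if thinking_budget is not None and isinstance(thinking_budget, int):
        params["thinkingBudget"] = thinking_budget  # 0 is now forwarded
    return params

print(map_thinking_budget_old(0))  # {'includeThoughts': True}
print(map_thinking_budget_new(0))  # {'includeThoughts': True, 'thinkingBudget': 0}
```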

File tree

3 files changed: +102 -6 lines changed


litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py

Lines changed: 31 additions & 4 deletions
```diff
@@ -57,6 +57,7 @@
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ def _map_thinking_param(
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget
 
         return params
@@ -740,6 +741,23 @@ def _handle_content_policy_violation(
 
         return model_response
 
+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count.
+
+        If promptTokenCount + candidatesTokenCount == totalTokenCount, the candidate
+        token count is inclusive of the thinking token count; otherwise it is exclusive.
+
+        Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
@@ -768,14 +786,23 @@ def _calculate_usage(
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
```
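
The usage change hinges on one accounting rule: if promptTokenCount + candidatesTokenCount already equals totalTokenCount, the candidate count includes thought tokens; otherwise the reasoning tokens have to be added on top. A standalone sketch of that rule with plain dicts and illustrative numbers (not litellm's actual types):

```python
# Standalone sketch of the inclusive/exclusive rule applied in _calculate_usage.
def candidate_count_is_inclusive(md: dict) -> bool:
    return md.get("promptTokenCount", 0) + md.get("candidatesTokenCount", 0) == md.get(
        "totalTokenCount", 0
    )

def completion_tokens(md: dict) -> int:
    tokens = md.get("candidatesTokenCount", 0)
    reasoning = md.get("thoughtsTokenCount", 0)
    if not candidate_count_is_inclusive(md) and reasoning:
        tokens += reasoning  # thought tokens reported separately, so add them in
    return tokens

inclusive = {"promptTokenCount": 10, "candidatesTokenCount": 10, "totalTokenCount": 20, "thoughtsTokenCount": 5}
exclusive = {"promptTokenCount": 10, "candidatesTokenCount": 5, "totalTokenCount": 20, "thoughtsTokenCount": 5}
print(completion_tokens(inclusive))  # 10 -- candidate count already covers the 5 thought tokens
print(completion_tokens(exclusive))  # 10 -- 5 visible tokens + 5 thought tokens
```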

tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py

Lines changed: 52 additions & 1 deletion
```diff
@@ -10,7 +10,8 @@
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage
 
 
 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test that the candidate token count is inclusive of the thinking token count
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens
```
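
Both parametrized cases expect completion_tokens == 10: in the inclusive case the reported candidatesTokenCount of 10 already covers the 5 thought tokens, while in the exclusive case _calculate_usage adds the 5 reasoning tokens to the 5 visible candidate tokens.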

tests/llm_translation/test_gemini.py

Lines changed: 19 additions & 1 deletion
```diff
@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages,  # make sure call works
     )
     print(response.choices[0].message)
-    assert response.choices[0].message.content is not None
+    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0}
+        }
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])
```
