
Commit a7db0df

Gemini-2.5-flash improvements (#10198)
* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0. Fixes #10121
* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive candidate tokens. Addresses #10141 (comment)
1 parent d1fb051 commit a7db0df
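
The thinking-budget fix comes down to Python truthiness: the old guard `if thinking_budget:` treated an explicit budget of 0 as falsy and dropped it. A minimal sketch of the difference, using plain dicts rather than litellm's own GeminiThinkingConfig type:

```python
# Illustrative sketch only -- plain dicts, not litellm's actual types.
def map_thinking_budget_old(thinking_budget):
    params = {"includeThoughts": True}
    if thinking_budget:  # 0 is falsy, so an explicit budget of 0 was silently dropped
        params["thinkingBudget"] = thinking_budget
    return params

def map_thinking_budget_new(thinking_budget):
    params = {"includeThoughts": True}
    if thinking_budget is not None and isinstance(thinking_budget, int):
        params["thinkingBudget"] = thinking_budget  # 0 is now forwarded
    return params

print(map_thinking_budget_old(0))  # {'includeThoughts': True}
print(map_thinking_budget_new(0))  # {'includeThoughts': True, 'thinkingBudget': 0}
```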

File tree

3 files changed: +102 -6 lines changed


litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py

Lines changed: 31 additions & 4 deletions
```diff
@@ -57,6 +57,7 @@
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ def _map_thinking_param(
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget
 
         return params
@@ -740,6 +741,23 @@ def _handle_content_policy_violation(
 
         return model_response
 
+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count.
+
+        If promptTokenCount + candidatesTokenCount == totalTokenCount, the candidate
+        token count is inclusive of the thinking token count; otherwise it is exclusive.
+
+        Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
@@ -768,14 +786,23 @@ def _calculate_usage(
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
```
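
The usage change hinges on one accounting rule: if promptTokenCount + candidatesTokenCount already equals totalTokenCount, the candidate count includes thought tokens; otherwise the reasoning tokens have to be added on top. A standalone sketch of that rule with plain dicts and illustrative numbers (not litellm's actual types):

```python
# Standalone sketch of the inclusive/exclusive rule applied in _calculate_usage.
def candidate_count_is_inclusive(md: dict) -> bool:
    return md.get("promptTokenCount", 0) + md.get("candidatesTokenCount", 0) == md.get(
        "totalTokenCount", 0
    )

def completion_tokens(md: dict) -> int:
    tokens = md.get("candidatesTokenCount", 0)
    reasoning = md.get("thoughtsTokenCount", 0)
    if not candidate_count_is_inclusive(md) and reasoning:
        tokens += reasoning  # thought tokens reported separately, so add them in
    return tokens

inclusive = {"promptTokenCount": 10, "candidatesTokenCount": 10, "totalTokenCount": 20, "thoughtsTokenCount": 5}
exclusive = {"promptTokenCount": 10, "candidatesTokenCount": 5, "totalTokenCount": 20, "thoughtsTokenCount": 5}
print(completion_tokens(inclusive))  # 10 -- candidate count already covers the 5 thought tokens
print(completion_tokens(exclusive))  # 10 -- 5 visible tokens + 5 thought tokens
```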

tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py

Lines changed: 52 additions & 1 deletion
```diff
@@ -10,7 +10,8 @@
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage
 
 
 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test that the candidate token count is inclusive of the thinking token count
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens
```
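
Both parametrized cases expect completion_tokens == 10: in the inclusive case the reported candidatesTokenCount of 10 already covers the 5 thought tokens, while in the exclusive case _calculate_usage adds the 5 reasoning tokens to the 5 visible candidate tokens.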

tests/llm_translation/test_gemini.py

Lines changed: 19 additions & 1 deletion
```diff
@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages,  # make sure call works
     )
     print(response.choices[0].message)
-    assert response.choices[0].message.content is not None
+    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0}
+        }
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])
```
