
Commit 36308a3

Gemini-2.5-flash - support reasoning cost calc + return reasoning content (#10141)
* build(model_prices_and_context_window.json): add vertex ai gemini-2.5-flash pricing
* build(model_prices_and_context_window.json): add gemini reasoning token pricing
* fix(vertex_and_google_ai_studio_gemini.py): support counting thinking tokens for gemini (allows accurate cost calc)
* fix(utils.py): add reasoning token cost calc to generic cost calc (ensures gemini-2.5-flash cost calculation is accurate)
* build(model_prices_and_context_window.json): mark gemini-2.5-flash as 'supports_reasoning'
* feat(gemini/): support 'thinking' + 'reasoning_effort' params + new unit tests (allow controlling thinking effort for gemini-2.5-flash models)
* test: update unit testing
* feat(vertex_and_google_ai_studio_gemini.py): return reasoning content if given in gemini response
* test: update model name
* fix: fix ruff check
* test(test_spend_management_endpoints.py): update tests to be less sensitive to new keys / updates to usage object
* fix(vertex_and_google_ai_studio_gemini.py): fix translation
1 parent db4ebe1 commit 36308a3
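Taken together, the changes below let callers request Gemini "thinking" tokens and have them billed correctly. A minimal sketch of how the new parameters might be exercised (the model alias and response fields follow this diff; exact names depend on your installed litellm version):

import litellm

# "reasoning_effort" is mapped by this commit to a Gemini thinkingConfig
# (low -> 1024-token thinking budget, includeThoughts=True).
response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",  # alias assumed from the pricing entries below
    messages=[{"role": "user", "content": "What is 24 * 17?"}],
    reasoning_effort="low",
)

# Thought parts are now surfaced separately from the final answer.
print(response.choices[0].message.reasoning_content)
print(response.choices[0].message.content)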

File tree

16 files changed: +453 additions, -88 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -86,3 +86,4 @@ litellm/proxy/db/migrations/0_init/migration.sql
 litellm/proxy/db/migrations/*
 litellm/proxy/migrations/*config.yaml
 litellm/proxy/migrations/*
+tests/litellm/litellm_core_utils/llm_cost_calc/log.txt

litellm/constants.py

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,10 @@
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 
+DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
+DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
+DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
+
 ########## Networking constants ##############################################################
 _DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600  # 1 hour, re-use the same httpx client for 1 hour

litellm/litellm_core_utils/llm_cost_calc/utils.py

Lines changed: 20 additions & 1 deletion
@@ -267,6 +267,7 @@ def generic_cost_per_token(
     ## CALCULATE OUTPUT COST
     text_tokens = usage.completion_tokens
     audio_tokens = 0
+    reasoning_tokens = 0
     if usage.completion_tokens_details is not None:
         audio_tokens = (
             cast(
@@ -282,14 +283,24 @@ def generic_cost_per_token(
             )
             or usage.completion_tokens  # default to completion tokens, if this field is not set
         )
-
+        reasoning_tokens = (
+            cast(
+                Optional[int],
+                getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
+            )
+            or 0
+        )
     ## TEXT COST
     completion_cost = float(text_tokens) * completion_base_cost
 
     _output_cost_per_audio_token: Optional[float] = model_info.get(
         "output_cost_per_audio_token"
    )
 
+    _output_cost_per_reasoning_token: Optional[float] = model_info.get(
+        "output_cost_per_reasoning_token"
+    )
+
     ## AUDIO COST
     if (
         _output_cost_per_audio_token is not None
@@ -298,4 +309,12 @@ def generic_cost_per_token(
     ):
         completion_cost += float(audio_tokens) * _output_cost_per_audio_token
 
+    ## REASONING COST
+    if (
+        _output_cost_per_reasoning_token is not None
+        and reasoning_tokens
+        and reasoning_tokens > 0
+    ):
+        completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
+
     return prompt_cost, completion_cost
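The effect of the new ## REASONING COST branch, restated as standalone arithmetic (token counts are illustrative; the per-token prices come from the pricing entries added later in this commit):

# Illustrative rates: 0.6e-6 USD per output token, 3.5e-6 USD per reasoning token.
output_cost_per_token = 0.6e-6
output_cost_per_reasoning_token = 3.5e-6

text_tokens = 500
reasoning_tokens = 200

completion_cost = text_tokens * output_cost_per_token
# New in this commit: reasoning tokens are billed at their own rate when
# "output_cost_per_reasoning_token" is present in model_info.
completion_cost += reasoning_tokens * output_cost_per_reasoning_token
print(f"{completion_cost:.6f}")  # 0.001000 = 0.000300 + 0.000700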

litellm/llms/anthropic/chat/transformation.py

Lines changed: 15 additions & 3 deletions
@@ -7,6 +7,9 @@
 import litellm
 from litellm.constants import (
     DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
     RESPONSE_FORMAT_TOOL_NAME,
 )
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
@@ -276,11 +279,20 @@ def _map_reasoning_effort(
         if reasoning_effort is None:
             return None
         elif reasoning_effort == "low":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+            )
         elif reasoning_effort == "medium":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
+            )
         elif reasoning_effort == "high":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+            )
         else:
             raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
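The refactor is behavior-preserving: the constants carry the same literals the function hard-coded before. The resulting mapping, for reference:

# reasoning_effort -> Anthropic thinking budget (values from litellm/constants.py above)
effort_to_budget = {
    "low": 1024,     # DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET
    "medium": 2048,  # DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET
    "high": 4096,    # DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET
}
# e.g. "high" -> AnthropicThinkingParam(type="enabled", budget_tokens=4096)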

litellm/llms/gemini/chat/transformation.py

Lines changed: 6 additions & 1 deletion
@@ -7,6 +7,7 @@
 )
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.llms.vertex_ai import ContentType, PartType
+from litellm.utils import supports_reasoning
 
 from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
 from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig
@@ -67,7 +68,7 @@ def get_config(cls):
         return super().get_config()
 
     def get_supported_openai_params(self, model: str) -> List[str]:
-        return [
+        supported_params = [
             "temperature",
             "top_p",
             "max_tokens",
@@ -83,6 +84,10 @@ def get_supported_openai_params(self, model: str) -> List[str]:
             "frequency_penalty",
             "modalities",
         ]
+        if supports_reasoning(model):
+            supported_params.append("reasoning_effort")
+            supported_params.append("thinking")
+        return supported_params
 
     def map_openai_params(
         self,
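A quick sketch of the new conditional (the config class name is an assumption, and supports_reasoning consults the loaded model-cost map, so the assertions hold only for models flagged supports_reasoning, like the gemini-2.5-flash entries below):

from litellm.llms.gemini.chat.transformation import GoogleAIStudioGeminiConfig  # class name assumed

params = GoogleAIStudioGeminiConfig().get_supported_openai_params(
    model="gemini-2.5-flash-preview-04-17"
)
assert "reasoning_effort" in params
assert "thinking" in params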

litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py

Lines changed: 93 additions & 13 deletions
@@ -24,13 +24,19 @@
 import litellm.litellm_core_utils
 import litellm.litellm_core_utils.litellm_logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
+)
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
 from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     HTTPHandler,
     get_async_httpx_client,
 )
+from litellm.types.llms.anthropic import AnthropicThinkingParam
 from litellm.types.llms.openai import (
     AllMessageValues,
     ChatCompletionResponseMessage,
@@ -45,6 +51,7 @@
     ContentType,
     FunctionCallingConfig,
     FunctionDeclaration,
+    GeminiThinkingConfig,
     GenerateContentResponseBody,
     HttpxPartType,
     LogprobsResult,
@@ -59,7 +66,7 @@
     TopLogprob,
     Usage,
 )
-from litellm.utils import CustomStreamWrapper, ModelResponse
+from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
 
 from ....utils import _remove_additional_properties, _remove_strict_from_schema
 from ..common_utils import VertexAIError, _build_vertex_schema
@@ -190,7 +197,7 @@ def get_config(cls):
         return super().get_config()
 
     def get_supported_openai_params(self, model: str) -> List[str]:
-        return [
+        supported_params = [
             "temperature",
             "top_p",
             "max_tokens",
@@ -210,6 +217,10 @@ def get_supported_openai_params(self, model: str) -> List[str]:
             "top_logprobs",
             "modalities",
         ]
+        if supports_reasoning(model):
+            supported_params.append("reasoning_effort")
+            supported_params.append("thinking")
+        return supported_params
 
     def map_tool_choice_values(
         self, model: str, tool_choice: Union[str, dict]
@@ -313,10 +324,14 @@ def _map_response_schema(self, value: dict) -> dict:
         if isinstance(old_schema, list):
             for item in old_schema:
                 if isinstance(item, dict):
-                    item = _build_vertex_schema(parameters=item, add_property_ordering=True)
+                    item = _build_vertex_schema(
+                        parameters=item, add_property_ordering=True
+                    )
 
         elif isinstance(old_schema, dict):
-            old_schema = _build_vertex_schema(parameters=old_schema, add_property_ordering=True)
+            old_schema = _build_vertex_schema(
+                parameters=old_schema, add_property_ordering=True
+            )
         return old_schema
 
     def apply_response_schema_transformation(self, value: dict, optional_params: dict):
@@ -343,6 +358,43 @@ def apply_response_schema_transformation(self, value: dict, optional_params: dic
             value=optional_params["response_schema"]
         )
 
+    @staticmethod
+    def _map_reasoning_effort_to_thinking_budget(
+        reasoning_effort: str,
+    ) -> GeminiThinkingConfig:
+        if reasoning_effort == "low":
+            return {
+                "thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+                "includeThoughts": True,
+            }
+        elif reasoning_effort == "medium":
+            return {
+                "thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
+                "includeThoughts": True,
+            }
+        elif reasoning_effort == "high":
+            return {
+                "thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+                "includeThoughts": True,
+            }
+        else:
+            raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
+
+    @staticmethod
+    def _map_thinking_param(
+        thinking_param: AnthropicThinkingParam,
+    ) -> GeminiThinkingConfig:
+        thinking_enabled = thinking_param.get("type") == "enabled"
+        thinking_budget = thinking_param.get("budget_tokens")
+
+        params: GeminiThinkingConfig = {}
+        if thinking_enabled:
+            params["includeThoughts"] = True
+        if thinking_budget:
+            params["thinkingBudget"] = thinking_budget
+
+        return params
+
     def map_openai_params(
         self,
         non_default_params: Dict,
@@ -399,6 +451,16 @@ def map_openai_params(
                 optional_params["tool_choice"] = _tool_choice_value
             elif param == "seed":
                 optional_params["seed"] = value
+            elif param == "reasoning_effort" and isinstance(value, str):
+                optional_params[
+                    "thinkingConfig"
+                ] = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(value)
+            elif param == "thinking":
+                optional_params[
+                    "thinkingConfig"
+                ] = VertexGeminiConfig._map_thinking_param(
+                    cast(AnthropicThinkingParam, value)
+                )
             elif param == "modalities" and isinstance(value, list):
                 response_modalities = []
                 for modality in value:
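What the two new branches leave in optional_params, sketched as plain dictionaries (shapes taken from the diff; the budget for "medium" comes from the constants added above):

# reasoning_effort="medium" produces:
optional_params = {
    "thinkingConfig": {"thinkingBudget": 2048, "includeThoughts": True}
}

# thinking={"type": "enabled", "budget_tokens": 8000} produces:
optional_params = {
    "thinkingConfig": {"includeThoughts": True, "thinkingBudget": 8000}
}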
@@ -514,19 +576,27 @@ def translate_exception_str(self, exception_string: str):
 
     def get_assistant_content_message(
         self, parts: List[HttpxPartType]
-    ) -> Optional[str]:
-        _content_str = ""
+    ) -> Tuple[Optional[str], Optional[str]]:
+        content_str: Optional[str] = None
+        reasoning_content_str: Optional[str] = None
         for part in parts:
+            _content_str = ""
             if "text" in part:
                 _content_str += part["text"]
             elif "inlineData" in part:  # base64 encoded image
                 _content_str += "data:{};base64,{}".format(
                     part["inlineData"]["mimeType"], part["inlineData"]["data"]
                 )
+            if part.get("thought") is True:
+                if reasoning_content_str is None:
+                    reasoning_content_str = ""
+                reasoning_content_str += _content_str
+            else:
+                if content_str is None:
+                    content_str = ""
+                content_str += _content_str
 
-        if _content_str:
-            return _content_str
-        return None
+        return content_str, reasoning_content_str
 
     def _transform_parts(
         self,
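A sketch of the new tuple-returning behavior, using an illustrative parts payload ("thought": true is how Gemini marks thinking output in this diff):

from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig

parts = [
    {"text": "First, 24 * 17 = 24 * 10 + 24 * 7...", "thought": True},
    {"text": "The answer is 408."},
]
content, reasoning_content = VertexGeminiConfig().get_assistant_content_message(parts=parts)
# content           == "The answer is 408."
# reasoning_content == "First, 24 * 17 = 24 * 10 + 24 * 7..."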
@@ -677,6 +747,7 @@ def _calculate_usage(
         audio_tokens: Optional[int] = None
         text_tokens: Optional[int] = None
         prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
+        reasoning_tokens: Optional[int] = None
         if "cachedContentTokenCount" in completion_response["usageMetadata"]:
             cached_tokens = completion_response["usageMetadata"][
                 "cachedContentTokenCount"
@@ -687,7 +758,10 @@ def _calculate_usage(
                     audio_tokens = detail["tokenCount"]
                 elif detail["modality"] == "TEXT":
                     text_tokens = detail["tokenCount"]
-
+        if "thoughtsTokenCount" in completion_response["usageMetadata"]:
+            reasoning_tokens = completion_response["usageMetadata"][
+                "thoughtsTokenCount"
+            ]
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
@@ -703,6 +777,7 @@ def _calculate_usage(
             ),
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
+            reasoning_tokens=reasoning_tokens,
         )
 
         return usage
@@ -731,11 +806,16 @@ def _process_candidates(self, _candidates, model_response, litellm_params):
                 citation_metadata.append(candidate["citationMetadata"])
 
             if "parts" in candidate["content"]:
-                chat_completion_message[
-                    "content"
-                ] = VertexGeminiConfig().get_assistant_content_message(
+                (
+                    content,
+                    reasoning_content,
+                ) = VertexGeminiConfig().get_assistant_content_message(
                     parts=candidate["content"]["parts"]
                 )
+                if content is not None:
+                    chat_completion_message["content"] = content
+                if reasoning_content is not None:
+                    chat_completion_message["reasoning_content"] = reasoning_content
 
             functions, tools = self._transform_parts(
                 parts=candidate["content"]["parts"],
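How the new usageMetadata field flows through _calculate_usage, with an illustrative payload:

usage_metadata = {
    "promptTokenCount": 12,
    "candidatesTokenCount": 700,
    "totalTokenCount": 712,
    "thoughtsTokenCount": 200,  # new: copied into reasoning_tokens
}
# _calculate_usage now passes reasoning_tokens=200 into the Usage object, where it
# surfaces as completion_tokens_details.reasoning_tokens and is picked up by
# generic_cost_per_token (see llm_cost_calc/utils.py above).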

litellm/model_prices_and_context_window_backup.json

Lines changed: 34 additions & 3 deletions
@@ -5178,19 +5178,50 @@
         "max_audio_length_hours": 8.4,
         "max_audio_per_prompt": 1,
         "max_pdf_size_mb": 30,
-        "input_cost_per_audio_token": 0.0000001,
-        "input_cost_per_token": 0.00000015,
-        "output_cost_per_token": 0.00000060,
+        "input_cost_per_audio_token": 1e-6,
+        "input_cost_per_token": 0.15e-6,
+        "output_cost_per_token": 0.6e-6,
+        "output_cost_per_reasoning_token": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
         "tpm": 250000,
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,
+        "supports_reasoning": true,
+        "supports_response_schema": true,
+        "supports_audio_output": false,
+        "supports_tool_choice": true,
+        "supported_endpoints": ["/v1/chat/completions", "/v1/completions"],
+        "supported_modalities": ["text", "image", "audio", "video"],
+        "supported_output_modalities": ["text"],
+        "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
+    },
+    "gemini-2.5-flash-preview-04-17": {
+        "max_tokens": 65536,
+        "max_input_tokens": 1048576,
+        "max_output_tokens": 65536,
+        "max_images_per_prompt": 3000,
+        "max_videos_per_prompt": 10,
+        "max_video_length": 1,
+        "max_audio_length_hours": 8.4,
+        "max_audio_per_prompt": 1,
+        "max_pdf_size_mb": 30,
+        "input_cost_per_audio_token": 1e-6,
+        "input_cost_per_token": 0.15e-6,
+        "output_cost_per_token": 0.6e-6,
+        "output_cost_per_reasoning_token": 3.5e-6,
+        "litellm_provider": "vertex_ai-language-models",
+        "mode": "chat",
+        "supports_reasoning": true,
+        "supports_system_messages": true,
+        "supports_function_calling": true,
+        "supports_vision": true,
         "supports_response_schema": true,
         "supports_audio_output": false,
         "supports_tool_choice": true,
+        "supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"],
         "supported_modalities": ["text", "image", "audio", "video"],
         "supported_output_modalities": ["text"],
         "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"