
[Bug fix] Router - handle cooldown_time = 0 for deployments #12108


Merged: 5 commits, Jun 28, 2025
10 changes: 9 additions & 1 deletion litellm/router_utils/cooldown_cache.py
@@ -62,7 +62,15 @@ def add_deployment_to_cooldown(
        cooldown_time: Optional[float],
    ):
        try:
-            _cooldown_time = cooldown_time or self.default_cooldown_time
+            #########################################################
+            # get cooldown time
+            # 1. If dynamic cooldown time is set for the model/deployment, use that
+            # 2. If no dynamic cooldown time is set, use the default cooldown time set on CooldownCache
+            _cooldown_time = cooldown_time
+            if _cooldown_time is None:
+                _cooldown_time = self.default_cooldown_time
+            #########################################################
+
            cooldown_key, cooldown_data = self._common_add_cooldown_logic(
                model_id=model_id,
                original_exception=original_exception,
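Note on the change above: the old line used `cooldown_time or self.default_cooldown_time`, and Python's `or` treats an explicit 0 as falsy, so a deployment configured with cooldown_time=0 silently fell back to the default. A minimal standalone sketch of the corrected resolution rule (the function name and 60-second default here are illustrative, not from the PR):

    from typing import Optional

    DEFAULT_COOLDOWN_TIME = 60.0  # stand-in for CooldownCache.default_cooldown_time

    def resolve_cooldown_time(cooldown_time: Optional[float]) -> float:
        # `cooldown_time or DEFAULT_COOLDOWN_TIME` would treat 0 as falsy and
        # silently substitute the default; an explicit None check does not.
        if cooldown_time is None:
            return DEFAULT_COOLDOWN_TIME
        return cooldown_time

    assert resolve_cooldown_time(None) == 60.0  # no dynamic value -> default
    assert resolve_cooldown_time(0) == 0        # explicit 0 is preserved
    assert resolve_cooldown_time(5.0) == 5.0    # positive dynamic value wins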
2 changes: 1 addition & 1 deletion litellm/router_utils/cooldown_callbacks.py
@@ -22,7 +22,7 @@ async def router_cooldown_event_callback(
    litellm_router_instance: LitellmRouter,
    deployment_id: str,
    exception_status: Union[str, int],
-    cooldown_time: float,
+    cooldown_time: Optional[float],
):
    """
    Callback triggered when a deployment is put into cooldown by litellm
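The widened Optional[float] annotation matches the handler change below: _set_cooldown_deployments now forwards the raw time_to_cooldown instead of a pre-resolved float, so this callback can receive None. A sketch of how a consumer might treat that case (the helper below is hypothetical, not part of litellm):

    from typing import Optional

    def describe_cooldown(cooldown_time: Optional[float]) -> str:
        # None means no dynamic cooldown was supplied; the cache default applies.
        if cooldown_time is None:
            return "deployment cooled down for the default window"
        return f"deployment cooled down for {cooldown_time}s"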
32 changes: 24 additions & 8 deletions litellm/router_utils/cooldown_handlers.py
@@ -7,6 +7,7 @@
"""

import asyncio
+import math
from typing import TYPE_CHECKING, Any, List, Optional, Union

import litellm
@@ -96,6 +97,7 @@ def _should_run_cooldown_logic(
    deployment: Optional[str],
    exception_status: Union[str, int],
    original_exception: Any,
+    time_to_cooldown: Optional[float] = None,
) -> bool:
    """
    Helper that decides if cooldown logic should be run
@@ -116,6 +118,17 @@
            "Should Not Run Cooldown Logic: deployment id is none or model group can't be found."
        )
        return False
+
+    #########################################################
+    # If time_to_cooldown is 0 or 0.0000000, don't run cooldown logic
+    #########################################################
+    if time_to_cooldown is not None and math.isclose(
+        a=time_to_cooldown,
+        b=0.0,
+        abs_tol=1e-9
+    ):
+        verbose_router_logger.debug("Should Not Run Cooldown Logic: time_to_cooldown is effectively 0")
+        return False

    if litellm_router_instance.disable_cooldowns:
        verbose_router_logger.debug(
@@ -261,7 +274,11 @@ def _set_cooldown_deployments(

    if (
        _should_run_cooldown_logic(
-            litellm_router_instance, deployment, exception_status, original_exception
+            litellm_router_instance=litellm_router_instance,
+            deployment=deployment,
+            exception_status=exception_status,
+            original_exception=original_exception,
+            time_to_cooldown=time_to_cooldown,
        )
        is False
        or deployment is None
@@ -270,20 +287,19 @@
        return False

    exception_status_int = cast_exception_status_to_int(exception_status)

-    verbose_router_logger.debug(f"Attempting to add {deployment} to cooldown list")
-    cooldown_time = litellm_router_instance.cooldown_time or 1
-    if time_to_cooldown is not None:
-        cooldown_time = time_to_cooldown

    if _should_cooldown_deployment(
-        litellm_router_instance, deployment, exception_status, original_exception
+        litellm_router_instance=litellm_router_instance,
+        deployment=deployment,
+        exception_status=exception_status,
+        original_exception=original_exception,
    ):
        litellm_router_instance.cooldown_cache.add_deployment_to_cooldown(
            model_id=deployment,
            original_exception=original_exception,
            exception_status=exception_status_int,
-            cooldown_time=cooldown_time,
+            cooldown_time=time_to_cooldown,
        )

        # Trigger cooldown callback handler
@@ -292,7 +308,7 @@
                litellm_router_instance=litellm_router_instance,
                deployment_id=deployment,
                exception_status=exception_status,
-                cooldown_time=cooldown_time,
+                cooldown_time=time_to_cooldown,
            )
        )
        return True
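Why math.isclose with abs_tol=1e-9 rather than time_to_cooldown == 0: a cooldown value produced by float arithmetic can be a tiny nonzero number (the in-code comment's "0 or 0.0000000"), and exact equality would miss it. A self-contained sketch of the comparison (the helper name is mine):

    import math

    def is_effectively_zero(value: float) -> bool:
        # abs_tol is essential: math.isclose's default relative tolerance can
        # never match a nonzero value against 0.0, however small it is.
        return math.isclose(value, 0.0, abs_tol=1e-9)

    assert is_effectively_zero(0.0)
    assert is_effectively_zero(1e-10)       # float noise still counts as zero
    assert not is_effectively_zero(0.001)   # a real 1 ms cooldown does not
    assert not math.isclose(1e-10, 0.0)     # default tolerances would miss it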
115 changes: 114 additions & 1 deletion tests/local_testing/test_router_cooldowns.py
@@ -22,7 +22,7 @@
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
-from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
+from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments, _should_run_cooldown_logic
from litellm.types.router import (
    DeploymentTypedDict,
    LiteLLMParamsTypedDict,
@@ -121,6 +121,119 @@ async def test_dynamic_cooldowns():
    assert tmp_mock.call_args[0][0]["litellm_params"]["cooldown_time"] == 0


+@pytest.mark.asyncio
+async def test_cooldown_time_zero_uses_zero_not_default():
+    """
+    Test that when cooldown_time=0 is passed, it uses 0 instead of the default cooldown time
+    AND that the early exit logic prevents cooldown entirely
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "cooldown_time": 0,
+                },
+            },
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4",
+                },
+            },
+        ],
+        cooldown_time=300,  # Default cooldown time is 300 seconds
+        num_retries=0,
+    )
+
+    # Mock the add_deployment_to_cooldown method to verify it's NOT called
+    with patch.object(router.cooldown_cache, "add_deployment_to_cooldown") as mock_add_cooldown:
+        try:
+            await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                mock_response="litellm.RateLimitError",
+            )
+        except litellm.RateLimitError:
+            pass
+
+        # Verify that add_deployment_to_cooldown was NOT called due to early exit
+        mock_add_cooldown.assert_not_called()
+
+    # Also verify the deployment is not in cooldown
+    cooldown_list = await _async_get_cooldown_deployments(
+        litellm_router_instance=router, parent_otel_span=None
+    )
+    assert len(cooldown_list) == 0
+
+    # Verify the deployment is still healthy and available
+    healthy_deployments, _ = await router._async_get_healthy_deployments(
+        model="gpt-3.5-turbo", parent_otel_span=None
+    )
+    assert len(healthy_deployments) == 1
+
+
+def test_should_run_cooldown_logic_early_exit_on_zero_cooldown():
+    """
+    Unit test for _should_run_cooldown_logic to verify early exit when time_to_cooldown is 0
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+                "model_info": {
+                    "id": "test-deployment-id",
+                },
+            }
+        ],
+        cooldown_time=300,
+    )
+
+    # Test with time_to_cooldown = 0 - should return False (don't run cooldown logic)
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=0.0
+    )
+    assert result is False, "Should not run cooldown logic when time_to_cooldown is 0"
+
+    # Test with very small time_to_cooldown (effectively 0) - should return False
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=1e-10
+    )
+    assert result is False, "Should not run cooldown logic when time_to_cooldown is effectively 0"
+
+    # Test with None time_to_cooldown - should return True (use default cooldown logic)
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=None
+    )
+    assert result is True, "Should run cooldown logic when time_to_cooldown is None"
+
+    # Test with positive time_to_cooldown - should return True
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=60.0
+    )
+    assert result is True, "Should run cooldown logic when time_to_cooldown is positive"
+
+
@pytest.mark.parametrize("num_deployments", [1, 2])
def test_single_deployment_no_cooldowns(num_deployments):
    """
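Taken together, the tests imply the following user-facing behavior; this is a sketch of the configuration shape drawn from the test code above, not an excerpt from the PR:

    from litellm import Router

    # After this fix, an explicit cooldown_time of 0 on a deployment disables
    # cooldowns for that deployment rather than falling back to the default.
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "cooldown_time": 0,  # never put this deployment in cooldown
                },
            },
        ],
        cooldown_time=300,  # default still applies to deployments without an override
    )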