
[Bug fix] Router - handle cooldown_time = 0 for deployments #12108


Merged: 5 commits, Jun 28, 2025
10 changes: 9 additions & 1 deletion litellm/router_utils/cooldown_cache.py
@@ -62,7 +62,15 @@ def add_deployment_to_cooldown(
        cooldown_time: Optional[float],
    ):
        try:
-            _cooldown_time = cooldown_time or self.default_cooldown_time
+            #########################################################
+            # get cooldown time
+            # 1. If dynamic cooldown time is set for the model/deployment, use that
+            # 2. If no dynamic cooldown time is set, use the default cooldown time set on CooldownCache
+            _cooldown_time = cooldown_time
+            if _cooldown_time is None:
+                _cooldown_time = self.default_cooldown_time
+            #########################################################
+
            cooldown_key, cooldown_data = self._common_add_cooldown_logic(
                model_id=model_id,
                original_exception=original_exception,
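Note on the change above: the old line used `cooldown_time or self.default_cooldown_time`, and Python's `or` treats an explicit 0 as falsy, so a deployment configured with cooldown_time=0 silently fell back to the default. A minimal standalone sketch of the corrected resolution rule (the function name and 60-second default here are illustrative, not from the PR):

    from typing import Optional

    DEFAULT_COOLDOWN_TIME = 60.0  # stand-in for CooldownCache.default_cooldown_time

    def resolve_cooldown_time(cooldown_time: Optional[float]) -> float:
        # `cooldown_time or DEFAULT_COOLDOWN_TIME` would treat 0 as falsy and
        # silently substitute the default; an explicit None check does not.
        if cooldown_time is None:
            return DEFAULT_COOLDOWN_TIME
        return cooldown_time

    assert resolve_cooldown_time(None) == 60.0  # no dynamic value -> default
    assert resolve_cooldown_time(0) == 0        # explicit 0 is preserved
    assert resolve_cooldown_time(5.0) == 5.0    # positive dynamic value wins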
2 changes: 1 addition & 1 deletion litellm/router_utils/cooldown_callbacks.py
@@ -22,7 +22,7 @@ async def router_cooldown_event_callback(
    litellm_router_instance: LitellmRouter,
    deployment_id: str,
    exception_status: Union[str, int],
-    cooldown_time: float,
+    cooldown_time: Optional[float],
):
    """
    Callback triggered when a deployment is put into cooldown by litellm
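The widened Optional[float] annotation matches the handler change below: _set_cooldown_deployments now forwards the raw time_to_cooldown instead of a pre-resolved float, so this callback can receive None. A sketch of how a consumer might treat that case (the helper below is hypothetical, not part of litellm):

    from typing import Optional

    def describe_cooldown(cooldown_time: Optional[float]) -> str:
        # None means no dynamic cooldown was supplied; the cache default applies.
        if cooldown_time is None:
            return "deployment cooled down for the default window"
        return f"deployment cooled down for {cooldown_time}s"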
32 changes: 24 additions & 8 deletions litellm/router_utils/cooldown_handlers.py
@@ -7,6 +7,7 @@
"""

import asyncio
+import math
from typing import TYPE_CHECKING, Any, List, Optional, Union

import litellm
@@ -96,6 +97,7 @@ def _should_run_cooldown_logic(
    deployment: Optional[str],
    exception_status: Union[str, int],
    original_exception: Any,
+    time_to_cooldown: Optional[float] = None,
) -> bool:
    """
    Helper that decides if cooldown logic should be run
@@ -116,6 +118,17 @@
            "Should Not Run Cooldown Logic: deployment id is none or model group can't be found."
        )
        return False
+
+    #########################################################
+    # If time_to_cooldown is 0 or 0.0000000, don't run cooldown logic
+    #########################################################
+    if time_to_cooldown is not None and math.isclose(
+        a=time_to_cooldown,
+        b=0.0,
+        abs_tol=1e-9
+    ):
+        verbose_router_logger.debug("Should Not Run Cooldown Logic: time_to_cooldown is effectively 0")
+        return False

    if litellm_router_instance.disable_cooldowns:
        verbose_router_logger.debug(
@@ -261,7 +274,11 @@ def _set_cooldown_deployments(

    if (
        _should_run_cooldown_logic(
-            litellm_router_instance, deployment, exception_status, original_exception
+            litellm_router_instance=litellm_router_instance,
+            deployment=deployment,
+            exception_status=exception_status,
+            original_exception=original_exception,
+            time_to_cooldown=time_to_cooldown,
        )
        is False
        or deployment is None
@@ -270,20 +287,19 @@
        return False

    exception_status_int = cast_exception_status_to_int(exception_status)

-    verbose_router_logger.debug(f"Attempting to add {deployment} to cooldown list")
-    cooldown_time = litellm_router_instance.cooldown_time or 1
-    if time_to_cooldown is not None:
-        cooldown_time = time_to_cooldown

    if _should_cooldown_deployment(
-        litellm_router_instance, deployment, exception_status, original_exception
+        litellm_router_instance=litellm_router_instance,
+        deployment=deployment,
+        exception_status=exception_status,
+        original_exception=original_exception,
    ):
        litellm_router_instance.cooldown_cache.add_deployment_to_cooldown(
            model_id=deployment,
            original_exception=original_exception,
            exception_status=exception_status_int,
-            cooldown_time=cooldown_time,
+            cooldown_time=time_to_cooldown,
        )

        # Trigger cooldown callback handler
@@ -292,7 +308,7 @@
                litellm_router_instance=litellm_router_instance,
                deployment_id=deployment,
                exception_status=exception_status,
-                cooldown_time=cooldown_time,
+                cooldown_time=time_to_cooldown,
            )
        )
        return True
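Why math.isclose with abs_tol=1e-9 rather than time_to_cooldown == 0: a cooldown value produced by float arithmetic can be a tiny nonzero number (the in-code comment's "0 or 0.0000000"), and exact equality would miss it. A self-contained sketch of the comparison (the helper name is mine):

    import math

    def is_effectively_zero(value: float) -> bool:
        # abs_tol is essential: math.isclose's default relative tolerance can
        # never match a nonzero value against 0.0, however small it is.
        return math.isclose(value, 0.0, abs_tol=1e-9)

    assert is_effectively_zero(0.0)
    assert is_effectively_zero(1e-10)       # float noise still counts as zero
    assert not is_effectively_zero(0.001)   # a real 1 ms cooldown does not
    assert not math.isclose(1e-10, 0.0)     # default tolerances would miss it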
115 changes: 114 additions & 1 deletion tests/local_testing/test_router_cooldowns.py
@@ -22,7 +22,7 @@
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
-from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
+from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments, _should_run_cooldown_logic
from litellm.types.router import (
    DeploymentTypedDict,
    LiteLLMParamsTypedDict,
@@ -121,6 +121,119 @@ async def test_dynamic_cooldowns():
    assert tmp_mock.call_args[0][0]["litellm_params"]["cooldown_time"] == 0


+@pytest.mark.asyncio
+async def test_cooldown_time_zero_uses_zero_not_default():
+    """
+    Test that when cooldown_time=0 is passed, it uses 0 instead of the default cooldown time
+    AND that the early exit logic prevents cooldown entirely
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "cooldown_time": 0,
+                },
+            },
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4",
+                },
+            },
+        ],
+        cooldown_time=300,  # Default cooldown time is 300 seconds
+        num_retries=0,
+    )
+
+    # Mock the add_deployment_to_cooldown method to verify it's NOT called
+    with patch.object(router.cooldown_cache, "add_deployment_to_cooldown") as mock_add_cooldown:
+        try:
+            await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                mock_response="litellm.RateLimitError",
+            )
+        except litellm.RateLimitError:
+            pass
+
+        # Verify that add_deployment_to_cooldown was NOT called due to early exit
+        mock_add_cooldown.assert_not_called()
+
+    # Also verify the deployment is not in cooldown
+    cooldown_list = await _async_get_cooldown_deployments(
+        litellm_router_instance=router, parent_otel_span=None
+    )
+    assert len(cooldown_list) == 0
+
+    # Verify the deployment is still healthy and available
+    healthy_deployments, _ = await router._async_get_healthy_deployments(
+        model="gpt-3.5-turbo", parent_otel_span=None
+    )
+    assert len(healthy_deployments) == 1
+
+
+def test_should_run_cooldown_logic_early_exit_on_zero_cooldown():
+    """
+    Unit test for _should_run_cooldown_logic to verify early exit when time_to_cooldown is 0
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                },
+                "model_info": {
+                    "id": "test-deployment-id",
+                },
+            }
+        ],
+        cooldown_time=300,
+    )
+
+    # Test with time_to_cooldown = 0 - should return False (don't run cooldown logic)
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=0.0
+    )
+    assert result is False, "Should not run cooldown logic when time_to_cooldown is 0"
+
+    # Test with very small time_to_cooldown (effectively 0) - should return False
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=1e-10
+    )
+    assert result is False, "Should not run cooldown logic when time_to_cooldown is effectively 0"
+
+    # Test with None time_to_cooldown - should return True (use default cooldown logic)
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=None
+    )
+    assert result is True, "Should run cooldown logic when time_to_cooldown is None"
+
+    # Test with positive time_to_cooldown - should return True
+    result = _should_run_cooldown_logic(
+        litellm_router_instance=router,
+        deployment="test-deployment-id",
+        exception_status=429,
+        original_exception=litellm.RateLimitError("test error", "openai", "gpt-3.5-turbo"),
+        time_to_cooldown=60.0
+    )
+    assert result is True, "Should run cooldown logic when time_to_cooldown is positive"
+
+
@pytest.mark.parametrize("num_deployments", [1, 2])
def test_single_deployment_no_cooldowns(num_deployments):
    """
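Taken together, the tests imply the following user-facing behavior; this is a sketch of the configuration shape drawn from the test code above, not an excerpt from the PR:

    from litellm import Router

    # After this fix, an explicit cooldown_time of 0 on a deployment disables
    # cooldowns for that deployment rather than falling back to the default.
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "cooldown_time": 0,  # never put this deployment in cooldown
                },
            },
        ],
        cooldown_time=300,  # default still applies to deployments without an override
    )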