Skip to content

Commit b4247e7

Browse files
krrishdholakiastefan--
authored and committed
Rate Limiting: Check all slots on redis, Reduce number of cache writes (BerriAI#11299)
* fix(base_routing_strategy.py): compress increments to redis - reduces write ops * fix(base_routing_strategy.py): make get and reset in memory keys atomic * fix(base_routing_strategy.py): don't reset keys - causes discrepency on subsequent requests to instance * fix(parallel_request_limiter.py): retrieve values of previous slots from cache more accurate rate limiting with sliding window * fix: fix test * fix: fix linting error
1 parent 939ba2f commit b4247e7

File tree

4 files changed

+168
-70
lines changed

4 files changed

+168
-70
lines changed

litellm/proxy/_new_secret_config.yaml

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,100 @@
11
model_list:
2-
- model_name: fake-openai-endpoint
2+
- model_name: "gemini-2.0-flash"
33
litellm_params:
4-
model: openai/fake
5-
api_key: fake-key
6-
api_base: https://exampleopenaiendpoint-production.up.railway.app/
7-
- model_name: "text-embedding-3-small"
4+
model: vertex_ai/gemini-2.0-flash
5+
vertex_project: my-project-id
6+
vertex_location: us-central1
7+
- model_name: "gpt-4o-mini-openai"
88
litellm_params:
9-
model: text-embedding-3-small
9+
model: gpt-4o-mini
1010
api_key: os.environ/OPENAI_API_KEY
11-
- model_name: papluca/xlm-roberta-base-language-detection
11+
- model_name: "bedrock-nova"
12+
litellm_params:
13+
model: us.amazon.nova-pro-v1:0
14+
- model_name: openrouter_model
15+
litellm_params:
16+
model: openrouter/openrouter_model
17+
api_key: os.environ/OPENROUTER_API_KEY
18+
api_base: http://0.0.0.0:8090
19+
- model_name: dall-e-3-azure
1220
litellm_params:
13-
model: openai/gpt-3.5-turbo
14-
api_base: https://api.openai.com
21+
model: azure/dall-e-3-test
22+
api_version: "2023-12-01-preview"
23+
api_base: os.environ/AZURE_SWEDEN_API_BASE
24+
api_key: os.environ/AZURE_SWEDEN_API_KEY
25+
model_info:
26+
input_cost_per_pixel: 10
27+
- model_name: "claude-3-7-sonnet"
28+
litellm_params:
29+
model: databricks/databricks-claude-3-7-sonnet
30+
api_key: os.environ/DATABRICKS_API_KEY
31+
api_base: os.environ/DATABRICKS_API_BASE
32+
- model_name: "gpt-4.1"
33+
litellm_params:
34+
model: azure/gpt-4.1
35+
api_key: os.environ/AZURE_API_KEY_REALTIME
36+
api_base: https://krris-m2f9a9i7-eastus2.openai.azure.com/
37+
- model_name: "xai/*"
38+
litellm_params:
39+
model: xai/*
40+
api_key: os.environ/XAI_API_KEY
41+
- model_name: "text-embedding-ada-002"
42+
litellm_params:
43+
model: text-embedding-ada-002
1544
api_key: os.environ/OPENAI_API_KEY
16-
45+
- model_name: gemini/gemini-2.0-flash
46+
litellm_params:
47+
model: gemini/gemini-2.0-flash
48+
- model_name: llama-qwen
49+
litellm_params:
50+
model: ollama/qwen2:0.5b
51+
model_info:
52+
input_cost_per_token: 0.75
53+
output_cost_per_token: 3
54+
- model_name: gpt-image-1
55+
litellm_params:
56+
model: gpt-image-1
57+
api_key: os.environ/OPENAI_API_KEY
58+
# drop_params: true
59+
- model_name: "gpt-4o-batch"
60+
litellm_params:
61+
model: azure/gpt-4o-mini
62+
api_base: os.environ/AZURE_API_BASE
63+
api_key: os.environ/AZURE_API_KEY
64+
model_info:
65+
id: my-general-azure-deployment
66+
mode: batch
67+
- model_name: "gpt-4o-batch"
68+
litellm_params:
69+
model: azure/gpt-4o-mini
70+
api_base: https://krris-m2f9a9i7-eastus2.openai.azure.com
71+
api_key: 04d22fb7e9ad4d9c8afe7c6abf97a6fc
72+
model_info:
73+
id: my-unique-azure-deployment
74+
mode: batch
75+
- model_name: fake-openai-endpoint
76+
litellm_params:
77+
model: openai/fake
78+
api_key: fake-key
79+
api_base: https://exampleopenaiendpoint-production.up.railway.app/
80+
81+
general_settings:
82+
store_model_in_db: true
83+
store_prompts_in_spend_logs: true
84+
disable_prisma_schema_update: true
85+
# master_key: os.environ/PROXY_MASTER_KEY
86+
87+
litellm_settings:
88+
cache: true
89+
cache_params:
90+
type: redis
91+
ttl: 600
92+
password: os.environ/REDIS_PASSWORD
93+
supported_call_types: ["acompletion", "completion"]
94+
95+
router_settings:
96+
redis_password: os.environ/REDIS_PASSWORD
97+
98+
99+
100+

litellm/proxy/hooks/parallel_request_limiter_v2.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(self, internal_usage_cache: InternalUsageCache):
6767
self,
6868
dual_cache=internal_usage_cache.dual_cache,
6969
should_batch_redis_writes=True,
70-
default_sync_interval=0.01,
70+
default_sync_interval=1,
7171
)
7272

7373
def print_verbose(self, print_statement):
@@ -120,6 +120,27 @@ def _get_current_usage_key(
120120
def get_key_pattern_to_sync(self) -> Optional[str]:
121121
return self.prefix + "::"
122122

123+
def _get_slots_to_check(self, current_slot: int) -> List[str]:
124+
slots_to_check = []
125+
current_time = datetime.now()
126+
for i in range(4):
127+
slot_number = (current_slot - i) % 4 # This ensures we wrap around properly
128+
minute = current_time.minute
129+
hour = current_time.hour
130+
131+
# If we need to look at previous minute
132+
if current_slot - i < 0:
133+
if minute == 0:
134+
# If we're at minute 0, go to previous hour
135+
hour = (current_time.hour - 1) % 24
136+
minute = 59
137+
else:
138+
minute = current_time.minute - 1
139+
140+
slot_key = f"{current_time.strftime('%Y-%m-%d')}-{hour:02d}-{minute:02d}-{slot_number}"
141+
slots_to_check.append(slot_key)
142+
return slots_to_check
143+
123144
async def check_key_in_limits_v2(
124145
self,
125146
user_api_key_dict: UserAPIKeyAuth,
@@ -145,25 +166,9 @@ async def check_key_in_limits_v2(
145166
current_slot = (
146167
current_time.second // 15
147168
) # This gives us 0-3 for the current 15s slot
148-
slots_to_check = []
169+
slots_to_check = self._get_slots_to_check(current_slot)
149170
slot_cache_keys = []
150171
# Calculate the last 4 slots, handling minute boundaries
151-
for i in range(4):
152-
slot_number = (current_slot - i) % 4 # This ensures we wrap around properly
153-
minute = current_time.minute
154-
hour = current_time.hour
155-
156-
# If we need to look at previous minute
157-
if current_slot - i < 0:
158-
if minute == 0:
159-
# If we're at minute 0, go to previous hour
160-
hour = (current_time.hour - 1) % 24
161-
minute = 59
162-
else:
163-
minute = current_time.minute - 1
164-
165-
slot_key = f"{current_time.strftime('%Y-%m-%d')}-{hour:02d}-{minute:02d}-{slot_number}"
166-
slots_to_check.append(slot_key)
167172

168173
# For each slot, create keys for all rate limit groups
169174
for slot_key in slots_to_check:
@@ -183,6 +188,8 @@ async def check_key_in_limits_v2(
183188
decrement_list.append(
184189
(key, -1 if increment_value_by_group[group] == 1 else 0)
185190
)
191+
else:
192+
self.add_to_in_memory_keys_to_update(key=key)
186193
slot_cache_keys.append(key)
187194

188195
if (

litellm/router_strategy/base_routing_strategy.py

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import asyncio
66
from abc import ABC
7-
from typing import List, Optional, Set, Tuple, Union
7+
from typing import Dict, List, Optional, Set, Tuple, Union
88

99
from litellm._logging import verbose_router_logger
1010
from litellm.caching.caching import DualCache
@@ -87,6 +87,7 @@ async def _increment_value_in_current_window(
8787
increment_value=value,
8888
ttl=ttl,
8989
)
90+
9091
self.redis_increment_operation_queue.append(increment_op)
9192
self.add_to_in_memory_keys_to_update(key=key)
9293
return result
@@ -116,24 +117,40 @@ async def _push_in_memory_increments_to_redis(self):
116117
"""
117118
How this works:
118119
- async_log_success_event collects all provider spend increments in `redis_increment_operation_queue`
119-
- This function pushes all increments to Redis in a batched pipeline to optimize performance
120+
- This function compresses multiple increments for the same key into a single operation
121+
- Then pushes all increments to Redis in a batched pipeline to optimize performance
120122
121123
Only runs if Redis is initialized
122124
"""
123125
try:
124126
if not self.dual_cache.redis_cache:
125127
return # Redis is not initialized
126128

127-
# verbose_router_logger.debug(
128-
# "Pushing Redis Increment Pipeline for queue: %s",
129-
# self.redis_increment_operation_queue,
130-
# )
131129
if len(self.redis_increment_operation_queue) > 0:
130+
# Compress operations for the same key
131+
compressed_ops: Dict[str, RedisPipelineIncrementOperation] = {}
132+
ops_to_remove = []
133+
for idx, op in enumerate(self.redis_increment_operation_queue):
134+
if op["key"] in compressed_ops:
135+
# Add to existing increment
136+
compressed_ops[op["key"]]["increment_value"] += op[
137+
"increment_value"
138+
]
139+
else:
140+
compressed_ops[op["key"]] = op
141+
142+
ops_to_remove.append(idx)
143+
# Convert back to list
144+
compressed_queue = list(compressed_ops.values())
145+
132146
await self.dual_cache.redis_cache.async_increment_pipeline(
133-
increment_list=self.redis_increment_operation_queue,
147+
increment_list=compressed_queue,
134148
)
135-
136-
self.redis_increment_operation_queue = []
149+
self.redis_increment_operation_queue = [
150+
op
151+
for idx, op in enumerate(self.redis_increment_operation_queue)
152+
if idx not in ops_to_remove
153+
]
137154

138155
except Exception as e:
139156
verbose_router_logger.error(
@@ -153,6 +170,12 @@ def get_key_pattern_to_sync(self) -> Optional[str]:
153170
def get_in_memory_keys_to_update(self) -> Set[str]:
154171
return self.in_memory_keys_to_update
155172

173+
def get_and_reset_in_memory_keys_to_update(self) -> Set[str]:
174+
"""Atomic get and reset in-memory keys to update"""
175+
keys = self.in_memory_keys_to_update
176+
self.in_memory_keys_to_update = set()
177+
return keys
178+
156179
def reset_in_memory_keys_to_update(self):
157180
self.in_memory_keys_to_update = set()
158181

@@ -174,9 +197,6 @@ async def _sync_in_memory_spend_with_redis(self):
174197
if self.dual_cache.redis_cache is None:
175198
return
176199

177-
# 1. Push all provider spend increments to Redis
178-
await self._push_in_memory_increments_to_redis()
179-
180200
# 2. Fetch all current provider spend from Redis to update in-memory cache
181201
cache_keys = (
182202
self.get_in_memory_keys_to_update()
@@ -195,39 +215,29 @@ async def _sync_in_memory_spend_with_redis(self):
195215
)
196216
)
197217
for k, v in zip(cache_keys_list, in_memory_before):
198-
in_memory_before_dict[k] = v
218+
in_memory_before_dict[k] = float(v or 0)
219+
220+
# 1. Push all provider spend increments to Redis
221+
await self._push_in_memory_increments_to_redis()
199222

200223
# 2. Fetch from Redis
201224
redis_values = await self.dual_cache.redis_cache.async_batch_get_cache(
202225
key_list=cache_keys_list
203226
)
204227

205-
# 3. Snapshot in-memory after
206-
in_memory_after = (
207-
await self.dual_cache.in_memory_cache.async_batch_get_cache(
208-
keys=cache_keys_list
209-
)
210-
)
211-
in_memory_after_dict = {}
212-
for k, v in zip(cache_keys_list, in_memory_after):
213-
in_memory_after_dict[k] = v
214-
215228
# 4. Merge
216229
for key in cache_keys_list:
217230
redis_val = float(redis_values.get(key, 0) or 0)
218231
before = float(in_memory_before_dict.get(key, 0) or 0)
219-
after = float(in_memory_after_dict.get(key, 0) or 0)
232+
after = float(
233+
await self.dual_cache.in_memory_cache.async_get_cache(key=key) or 0
234+
)
220235
delta = after - before
221-
if delta > 0:
222-
await self._increment_value_in_current_window(
223-
key=key, value=delta, ttl=60
224-
)
225236
merged = redis_val + delta
226237
await self.dual_cache.in_memory_cache.async_set_cache(
227238
key=key, value=merged
228239
)
229240

230-
self.reset_in_memory_keys_to_update()
231241
except Exception as e:
232242
verbose_router_logger.exception(
233243
f"Error syncing in-memory cache with Redis: {str(e)}"

tests/test_litellm/router_strategy/test_base_routing_strategy.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ async def test_sync_in_memory_spend_with_redis(base_strategy, mock_dual_cache):
9999
# Setup test data
100100
base_strategy.in_memory_keys_to_update = {"key1"}
101101

102-
# Mock the in-memory cache batch get responses
102+
# Mock the in-memory cache batch get responses for before snapshot
103103
in_memory_before_future: asyncio.Future[List[str]] = asyncio.Future()
104104
in_memory_before_future.set_result(["5.0"]) # Initial values
105105
mock_dual_cache.in_memory_cache.async_batch_get_cache.return_value = (
@@ -111,13 +111,12 @@ async def test_sync_in_memory_spend_with_redis(base_strategy, mock_dual_cache):
111111
redis_future.set_result({"key1": "15.0"}) # Redis values
112112
mock_dual_cache.redis_cache.async_batch_get_cache.return_value = redis_future
113113

114-
# Mock in-memory after snapshot
115-
in_memory_after_future: asyncio.Future[List[str]] = asyncio.Future()
116-
in_memory_after_future.set_result(["8.0"]) # Values after potential updates
117-
mock_dual_cache.in_memory_cache.async_batch_get_cache.side_effect = [
118-
in_memory_before_future, # First call for before snapshot
119-
in_memory_after_future, # Second call for after snapshot
120-
]
114+
# Mock in-memory get for after snapshot
115+
in_memory_after_future: asyncio.Future[Optional[str]] = asyncio.Future()
116+
in_memory_after_future.set_result("8.0") # Value after potential updates
117+
mock_dual_cache.in_memory_cache.async_get_cache.return_value = (
118+
in_memory_after_future
119+
)
121120

122121
await base_strategy._sync_in_memory_spend_with_redis()
123122

@@ -129,19 +128,17 @@ async def test_sync_in_memory_spend_with_redis(base_strategy, mock_dual_cache):
129128

130129
# Verify in-memory cache was updated with merged values
131130
# For key1: redis_val(15.0) + delta(8.0 - 5.0) = 18.0
132-
# For key2: redis_val(20.0) + delta(12.0 - 10.0) = 22.0
133131
assert mock_dual_cache.in_memory_cache.async_set_cache.call_count == 1
134132

135133
# Verify the final merged values
136134
set_cache_calls = mock_dual_cache.in_memory_cache.async_set_cache.call_args_list
137-
print(f"set_cache_calls: {set_cache_calls}")
138135
assert any(
139-
call.kwargs["key"] == "key1" and call.kwargs["value"] == 18.0
136+
call.kwargs["key"] == "key1" and float(call.kwargs["value"]) == 18.0
140137
for call in set_cache_calls
141138
)
142139

143-
# Verify cache keys were reset
144-
assert len(base_strategy.in_memory_keys_to_update) == 0
140+
# Verify cache keys still exist
141+
assert len(base_strategy.in_memory_keys_to_update) == 1
145142

146143

147144
def test_cache_keys_management(base_strategy):

0 commit comments

Comments
 (0)