
[Performance]: Add debugging endpoint to track active /asyncio-tasks #11382


Merged · 2 commits · Jun 4, 2025
39 changes: 39 additions & 0 deletions litellm/proxy/common_utils/debug_utils.py
@@ -1,7 +1,9 @@
# Start tracing memory allocations
import asyncio
import json
import os
import tracemalloc
from collections import Counter

from fastapi import APIRouter

@@ -10,6 +12,43 @@

router = APIRouter()


@router.get("/debug/asyncio-tasks")
async def get_active_tasks_stats():
    """
    Returns:
        total_active_tasks: int
        by_name: { coroutine_name: count }
    """
    MAX_TASKS_TO_CHECK = 5000
    # Gather all tasks in this event loop (including this endpoint's own task).
    all_tasks = asyncio.all_tasks()

    # Filter out tasks that are already done.
    active_tasks = [t for t in all_tasks if not t.done()]

    # Count how many active tasks exist, grouped by coroutine function name.
    counter = Counter()
    for idx, task in enumerate(active_tasks):
        # Circuit breaker: cap the work done here on very busy event loops.
        if idx >= MAX_TASKS_TO_CHECK:
            break
        coro = task.get_coro()
        # Derive a human-readable name from the coroutine.
        name = (
            getattr(coro, "__qualname__", None)
            or getattr(coro, "__name__", None)
            or repr(coro)
        )
        counter[name] += 1

    return {
        "total_active_tasks": len(active_tasks),
        "by_name": dict(counter),
    }


if os.environ.get("LITELLM_PROFILE", "false").lower() == "true":
    try:
        import objgraph  # type: ignore
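
Once the proxy is running, the new endpoint can be smoke-tested with a plain HTTP GET. A minimal sketch, assuming the proxy listens on http://localhost:4000 (the default port) and a master key of sk-1234; both values are placeholders for your deployment:

import requests  # third-party HTTP client, installed separately

# Placeholder base URL and key; adjust for your deployment.
PROXY_BASE = "http://localhost:4000"
HEADERS = {"Authorization": "Bearer sk-1234"}

resp = requests.get(f"{PROXY_BASE}/debug/asyncio-tasks", headers=HEADERS)
resp.raise_for_status()
stats = resp.json()

# Expected shape: {"total_active_tasks": <int>, "by_name": {<coroutine name>: <count>, ...}}
print(f"total active tasks: {stats['total_active_tasks']}")
for name, count in sorted(stats["by_name"].items(), key=lambda kv: -kv[1]):
    print(f"{count:6d}  {name}")

Polling this during a load test and diffing the by_name counts between snapshots is a quick way to spot coroutines whose task counts grow without bound.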
107 changes: 104 additions & 3 deletions litellm/proxy/proxy_config.yaml
@@ -1,8 +1,109 @@
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: "anthropic/*"
    litellm_params:
      model: "anthropic/*"
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: "bedrock/*"
    litellm_params:
      model: "bedrock/*"
  - model_name: "bedrock-useast1/*"
    litellm_params:
      model: "bedrock/*"
  - model_name: "bedrock-useast2/*"
    litellm_params:
      model: "bedrock/*"
      aws_region_name: us-east-2
  - model_name: "bedrock-uswest2/*"
    litellm_params:
      model: "bedrock/*"
      aws_region_name: us-west-2
  - model_name: "vertex_ai/*"
    litellm_params:
      model: "vertex_ai/*"
      vertex_project: os.environ/VERTEX_PROJECT
      vertex_location: os.environ/VERTEX_LOCATION
      vertex_credentials: os.environ/VERTEX_SERVICE_ACCOUNT
  - model_name: "gemini/*"
    litellm_params:
      model: "gemini/*"
      api_key: os.environ/GEMINI_API_KEY
  - model_name: "gemini-dev/*"
    litellm_params:
      model: "gemini/*"
      api_key: os.environ/GEMINI_API_KEY_DEV
  - model_name: "databricks/*"
    litellm_params:
      model: "databricks/*"
      api_key: os.environ/DATABRICKS_API_KEY
      api_base: os.environ/DATABRICKS_API_BASE

litellm_settings:
  cache: True
  cache_params:
    type: redis
    host: os.environ/REDIS_HOST
    port: os.environ/REDIS_PORT
    password: os.environ/REDIS_PASSWORD
    supported_call_types:
      - acompletion
      - completion
  request_timeout: 30
  allowed_fails: 3
  # callbacks:
  #   - otel
  #   - prometheus
  failure_callback:
    - sentry
  success_callback:
    - s3_v2
  s3_callback_params:
    s3_bucket_name: load-testing-oct
  disable_token_counter: True
  default_internal_user_params:
    user_role: os.environ/DEFAULT_USER_ROLE

callback_settings:
  otel:
    message_logging: False

router_settings:
  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"], default="simple-shuffle"
  redis_host: os.environ/REDIS_HOST
  redis_port: os.environ/REDIS_PORT
  redis_password: os.environ/REDIS_PASSWORD
  retry_policy: {
    # Set the number of retries for each exception type.
    # The logic is as follows:
    #   1. For anything that is likely to repeat the same outcome, don't retry.
    #   2. Internal server errors might be transient, so retry once.
    #   3. For rate limit errors, retry twice.
    # https://docs.litellm.ai/docs/routing#advanced-custom-retries-cooldowns-based-on-error-type
    # Per that doc, rate limit retries use exponential backoff, whereas the others retry immediately.
    # A programmatic equivalent is sketched after this config.
    "AuthenticationErrorRetries": 0,
    "BadRequestErrorRetries": 0,
    "ContentPolicyViolationErrorRetries": 0,
    "InternalServerErrorRetries": 1,
    "RateLimitErrorRetries": 2,
    "TimeoutErrorRetries": 0
  }

general_settings:
  disable_spend_logs: True
  proxy_batch_write_at: 60
  use_redis_transaction_buffer: true
  alert_types: # https://docs.litellm.ai/docs/proxy/alerting#all-possible-alert-types
    - db_exceptions
    - cooldown_deployment
    - failed_tracking_spend
    - fallback_reports
    # - llm_requests_hanging
    - llm_too_slow
    - new_model_added
    - outage_alerts
    - region_outage_alerts
  alerting: ["slack"]