
Commit c67782a

Logfire - fix(opentelemetry.py): Fix otel proxy server initialization (#11091)
* fix(opentelemetry.py): Fix otel proxy server initialization. Fixes #10349 (comment)
* feat(router.py): allow ignoring invalid deployments on model load. Prevents invalid models from blocking the loading of other valid models; fixes an issue where, on instance spin-up, invalid models blocked valid models from being used
* test: add additional unit testing
* fix(user_api_key_auth.py): return the abbreviated key in the exception, to make it easy to debug which key is invalid for the client
* docs(config_settings.md): document the param
* fix(user_api_key_auth.py): fix the error string to match the previous one
1 parent 9b73928 commit c67782a

File tree

8 files changed: +145 −75 lines changed

litellm/integrations/opentelemetry.py

Lines changed: 7 additions & 1 deletion

@@ -135,7 +135,13 @@ def _init_otel_logger_on_litellm_proxy(self):
         - Adds Otel as a service callback
         - Sets `proxy_server.open_telemetry_logger` to self
         """
-        from litellm.proxy import proxy_server
+        try:
+            from litellm.proxy import proxy_server
+        except ImportError:
+            verbose_logger.warning(
+                "Proxy Server is not installed. Skipping OpenTelemetry initialization."
+            )
+            return

         # Add Otel as a service callback
         if "otel" not in litellm.service_callback:

litellm/proxy/auth/auth_utils.py

Lines changed: 4 additions & 0 deletions

@@ -534,3 +534,7 @@ def get_model_from_request(
         model = match.group(1)

     return model
+
+
+def abbreviate_api_key(api_key: str) -> str:
+    return f"sk-...{api_key[-4:]}"

litellm/proxy/auth/user_api_key_auth.py

Lines changed: 16 additions & 14 deletions

@@ -39,6 +39,7 @@
 )
 from litellm.proxy.auth.auth_exception_handler import UserAPIKeyAuthExceptionHandler
 from litellm.proxy.auth.auth_utils import (
+    abbreviate_api_key,
     get_end_user_id_from_request_body,
     get_model_from_request,
     get_request_route,

@@ -751,17 +752,25 @@ async def _user_api_key_auth_builder(  # noqa: PLR0915

     ## check for cache hit (In-Memory Cache)
     _user_role = None
+    abbreviated_api_key = abbreviate_api_key(api_key=api_key)
     if api_key.startswith("sk-"):
         api_key = hash_token(token=api_key)

     if valid_token is None:
-        valid_token = await get_key_object(
-            hashed_token=api_key,
-            prisma_client=prisma_client,
-            user_api_key_cache=user_api_key_cache,
-            parent_otel_span=parent_otel_span,
-            proxy_logging_obj=proxy_logging_obj,
-        )
+        try:
+            valid_token = await get_key_object(
+                hashed_token=api_key,
+                prisma_client=prisma_client,
+                user_api_key_cache=user_api_key_cache,
+                parent_otel_span=parent_otel_span,
+                proxy_logging_obj=proxy_logging_obj,
+            )
+        except ProxyException as e:
+            if e.code == 401 or e.code == "401":
+                e.message = "Authentication Error, Invalid proxy server token passed. Received API Key = {}, Key Hash (Token) ={}. Unable to find token in cache or `LiteLLM_VerificationTokenTable`".format(
+                    abbreviated_api_key, api_key
+                )
+            raise e
     # update end-user params on valid token
     # These can change per request - it's important to update them here
     valid_token.end_user_id = end_user_params.get("end_user_id")

@@ -775,13 +784,6 @@ async def _user_api_key_auth_builder(  # noqa: PLR0915
             valid_token
         )  # updating it here, allows all downstream reporting / checks to use the updated budget

-    if valid_token is None:
-        raise Exception(
-            "Invalid proxy server token passed. Received API Key (hashed)={}. Unable to find token in cache or `LiteLLM_VerificationTokenTable`".format(
-                api_key
-            )
-        )
-
     user_obj: Optional[LiteLLM_UserTable] = None
     valid_token_dict: dict = {}
     if valid_token is not None:
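
Net effect of this change: a failed key lookup now raises a 401 whose message names the abbreviated key and its hash, replacing the old unconditional `valid_token is None` check further down. A rough sketch of the rewrap pattern in isolation (this `ProxyException` is a simplified stand-in, not the real class):

class ProxyException(Exception):  # simplified stand-in for litellm's ProxyException
    def __init__(self, message: str, code: object) -> None:
        super().__init__(message)
        self.message = message
        self.code = code


def get_key_object(hashed_token: str) -> None:
    # stand-in for the real cache/DB lookup, which raises on an unknown token
    raise ProxyException(message="Authentication Error", code=401)


try:
    get_key_object(hashed_token="<hashed-token>")
except ProxyException as e:
    if e.code == 401 or e.code == "401":
        # enrich the message with the abbreviated key before re-raising
        e.message = "Authentication Error, Invalid proxy server token passed. Received API Key = sk-...wxyz"
    raise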

litellm/proxy/common_utils/encrypt_decrypt_utils.py

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@ def decrypt_value_helper(
         if exception_type == "debug":
             verbose_proxy_logger.debug(error_message)
             return None
+
         verbose_proxy_logger.error(error_message)
         # [Non-Blocking Exception. - this should not block decrypting other values]
         return None

litellm/proxy/management_endpoints/key_management_endpoints.py

Lines changed: 2 additions & 1 deletion

@@ -34,6 +34,7 @@
     get_key_object,
     get_team_object,
 )
+from litellm.proxy.auth.auth_utils import abbreviate_api_key
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_utils.timezone_utils import get_budget_reset_time
 from litellm.proxy.hooks.key_management_event_hooks import KeyManagementEventHooks

@@ -1385,7 +1386,7 @@ async def generate_key_helper_fn(  # noqa: PLR0915
     ):  # allow user to disable storing abbreviated key name (shown in UI, to help figure out which key spent how much)
         pass
     else:
-        key_data["key_name"] = f"sk-...{token[-4:]}"
+        key_data["key_name"] = abbreviate_api_key(api_key=token)
     saved_token = copy.deepcopy(key_data)
     if isinstance(saved_token["aliases"], str):
         saved_token["aliases"] = json.loads(saved_token["aliases"])

litellm/proxy/proxy_server.py

Lines changed: 2 additions & 0 deletions

@@ -2000,6 +2000,7 @@ async def load_config(  # noqa: PLR0915
             router_general_settings=RouterGeneralSettings(
                 async_only_mode=True  # only init async clients
             ),
+            ignore_invalid_deployments=True,  # don't raise an error if a deployment is invalid
         )  # type:ignore

         if redis_usage_cache is not None and router.cache.redis_cache is None:

@@ -2326,6 +2327,7 @@ async def _update_llm_router(
             router_general_settings=RouterGeneralSettings(
                 async_only_mode=True  # only init async clients
             ),
+            ignore_invalid_deployments=True,
         )
         verbose_proxy_logger.debug(f"updated llm_router: {llm_router}")
     else:
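
Because the proxy now always passes `ignore_invalid_deployments=True`, one malformed entry in a config's `model_list` no longer blocks the remaining models from loading. A sketch of the equivalent direct Router usage (model names illustrative; assumes the valid entry can initialize in your environment):

import litellm

router = litellm.Router(
    model_list=[
        {"model_name": "gpt-4o", "litellm_params": {"model": "openai/gpt-4o"}},
        # invalid entry: "my-bad-model" resolves to no known provider
        {"model_name": "bad-model", "litellm_params": {"model": "my-bad-model"}},
    ],
    ignore_invalid_deployments=True,  # log and skip invalid entries instead of raising
)

# only the valid deployment makes it into the router's model list
print([m["model_name"] for m in (router.get_model_list() or [])])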

litellm/router.py

Lines changed: 82 additions & 59 deletions

@@ -255,6 +255,7 @@ def __init__(  # noqa: PLR0915
         router_general_settings: Optional[
             RouterGeneralSettings
         ] = RouterGeneralSettings(),
+        ignore_invalid_deployments: bool = False,
     ) -> None:
         """
         Initialize the Router class with the given parameters for caching, reliability, and routing strategy.

@@ -287,6 +288,7 @@ def __init__(  # noqa: PLR0915
             routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
             alerting_config (AlertingConfig): Slack alerting configuration. Defaults to None.
             provider_budget_config (ProviderBudgetConfig): Provider budget configuration. Use this to set llm_provider budget limits. example $100/day to OpenAI, $100/day to Azure, etc. Defaults to None.
+            ignore_invalid_deployments (bool): Ignores invalid deployments, and continues with other deployments. Default is to raise an error.
         Returns:
             Router: An instance of the litellm.Router class.

@@ -327,6 +329,7 @@ def __init__(  # noqa: PLR0915
         from litellm._service_logger import ServiceLogging

         self.set_verbose = set_verbose
+        self.ignore_invalid_deployments = ignore_invalid_deployments
         self.debug_level = debug_level
         self.enable_pre_call_checks = enable_pre_call_checks
         self.enable_tag_filtering = enable_tag_filtering

@@ -502,6 +505,7 @@ def __init__(  # noqa: PLR0915
             }
         }
         """
+
         ### ROUTING SETUP ###
         self.routing_strategy_init(
             routing_strategy=routing_strategy,

@@ -4467,52 +4471,63 @@ def _create_deployment(
         - Deployment: The deployment object
         - None: If the deployment is not active for the current environment (if 'supported_environments' is set in litellm_params)
         """
-        deployment = Deployment(
-            **deployment_info,
-            model_name=_model_name,
-            litellm_params=LiteLLM_Params(**_litellm_params),
-            model_info=_model_info,
-        )
+        try:
+            deployment = Deployment(
+                **deployment_info,
+                model_name=_model_name,
+                litellm_params=LiteLLM_Params(**_litellm_params),
+                model_info=_model_info,
+            )
+            for field in CustomPricingLiteLLMParams.model_fields.keys():
+                if deployment.litellm_params.get(field) is not None:
+                    _model_info[field] = deployment.litellm_params[field]
+
+            ## REGISTER MODEL INFO IN LITELLM MODEL COST MAP
+            model_id = deployment.model_info.id
+            if model_id is not None:
+                litellm.register_model(
+                    model_cost={
+                        model_id: _model_info,
+                    }
+                )

-        for field in CustomPricingLiteLLMParams.model_fields.keys():
-            if deployment.litellm_params.get(field) is not None:
-                _model_info[field] = deployment.litellm_params[field]
+            ## OLD MODEL REGISTRATION ## Kept to prevent breaking changes
+            _model_name = deployment.litellm_params.model
+            if deployment.litellm_params.custom_llm_provider is not None:
+                _model_name = (
+                    deployment.litellm_params.custom_llm_provider + "/" + _model_name
+                )

-        ## REGISTER MODEL INFO IN LITELLM MODEL COST MAP
-        model_id = deployment.model_info.id
-        if model_id is not None:
             litellm.register_model(
                 model_cost={
-                    model_id: _model_info,
+                    _model_name: _model_info,
                 }
             )

-        ## OLD MODEL REGISTRATION ## Kept to prevent breaking changes
-        _model_name = deployment.litellm_params.model
-        if deployment.litellm_params.custom_llm_provider is not None:
-            _model_name = (
-                deployment.litellm_params.custom_llm_provider + "/" + _model_name
-            )
-
-        litellm.register_model(
-            model_cost={
-                _model_name: _model_info,
-            }
-        )
-
-        ## Check if LLM Deployment is allowed for this deployment
-        if self.deployment_is_active_for_environment(deployment=deployment) is not True:
-            verbose_router_logger.warning(
-                f"Ignoring deployment {deployment.model_name} as it is not active for environment {deployment.model_info['supported_environments']}"
-            )
-            return None
+            ## Check if LLM Deployment is allowed for this deployment
+            if (
+                self.deployment_is_active_for_environment(deployment=deployment)
+                is not True
+            ):
+                verbose_router_logger.warning(
+                    f"Ignoring deployment {deployment.model_name} as it is not active for environment {deployment.model_info['supported_environments']}"
+                )
+                return None

-        deployment = self._add_deployment(deployment=deployment)
+            deployment = self._add_deployment(deployment=deployment)

-        model = deployment.to_json(exclude_none=True)
+            model = deployment.to_json(exclude_none=True)

-        self.model_list.append(model)
-        return deployment
+            self.model_list.append(model)
+            return deployment
+        except Exception as e:
+            if self.ignore_invalid_deployments:
+                verbose_router_logger.exception(
+                    f"Error creating deployment: {e}, ignoring and continuing with other deployments."
+                )
+                return None
+            else:
+                raise e

@@ -4760,12 +4775,11 @@ def add_deployment(self, deployment: Deployment) -> Optional[Deployment]:

         # add to model list
         _deployment = deployment.to_json(exclude_none=True)
-        self.model_list.append(_deployment)
-
         # initialize client
         self._add_deployment(deployment=deployment)

         # add to model names
+        self.model_list.append(_deployment)
         self.model_names.append(deployment.model_name)
         return deployment

@@ -4778,31 +4792,40 @@ def upsert_deployment(self, deployment: Deployment) -> Optional[Deployment]:
         Returns:
         - The added/updated deployment
         """
-        # check if deployment already exists
-        _deployment_model_id = deployment.model_info.id or ""
+        try:
+            # check if deployment already exists
+            _deployment_model_id = deployment.model_info.id or ""

-        _deployment_on_router: Optional[Deployment] = self.get_deployment(
-            model_id=_deployment_model_id
-        )
-        if _deployment_on_router is not None:
-            # deployment with this model_id exists on the router
-            if deployment.litellm_params == _deployment_on_router.litellm_params:
-                # No need to update
-                return None
+            _deployment_on_router: Optional[Deployment] = self.get_deployment(
+                model_id=_deployment_model_id
+            )
+            if _deployment_on_router is not None:
+                # deployment with this model_id exists on the router
+                if deployment.litellm_params == _deployment_on_router.litellm_params:
+                    # No need to update
+                    return None

-            # if there is a new litellm param -> then update the deployment
-            # remove the previous deployment
-            removal_idx: Optional[int] = None
-            for idx, model in enumerate(self.model_list):
-                if model["model_info"]["id"] == deployment.model_info.id:
-                    removal_idx = idx
+                # if there is a new litellm param -> then update the deployment
+                # remove the previous deployment
+                removal_idx: Optional[int] = None
+                for idx, model in enumerate(self.model_list):
+                    if model["model_info"]["id"] == deployment.model_info.id:
+                        removal_idx = idx

-            if removal_idx is not None:
-                self.model_list.pop(removal_idx)
+                if removal_idx is not None:
+                    self.model_list.pop(removal_idx)

-        # if the model_id is not in router
-        self.add_deployment(deployment=deployment)
-        return deployment
+            # if the model_id is not in router
+            self.add_deployment(deployment=deployment)
+            return deployment
+        except Exception as e:
+            if self.ignore_invalid_deployments:
+                verbose_router_logger.warning(
+                    f"Error upserting deployment: {e}, ignoring and continuing with other deployments."
+                )
+                return None
+            else:
+                raise e

     def delete_deployment(self, id: str) -> Optional[Deployment]:
         """

tests/litellm/test_router.py

Lines changed: 31 additions & 0 deletions

@@ -296,3 +296,34 @@ async def test_router_amoderation_with_credential_name(mock_amoderation):
     )
     assert call_kwargs["litellm_credential_name"] == "my-custom-auth"
     assert call_kwargs["model"] == "text-moderation-stable"
+
+
+def test_router_ignore_invalid_deployments():
+    """
+    Test that router.ignore_invalid_deployments is set to True
+    """
+    from litellm.types.router import Deployment
+
+    router = litellm.Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {"model": "my-bad-model"},
+            },
+        ],
+        ignore_invalid_deployments=True,
+    )
+
+    assert router.ignore_invalid_deployments is True
+    assert router.get_model_list() == []
+
+    ## check upsert deployment
+    router.upsert_deployment(
+        Deployment(
+            model_name="gpt-3.5-turbo",
+            litellm_params={"model": "my-bad-model"},
+            model_info={"tpm": 1000, "rpm": 1000},
+        )
+    )
+
+    assert router.get_model_list() == []
