From a48acd95f860c4fc85853e20668eabffff07cae7 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Wed, 7 May 2025 18:52:04 +0200
Subject: [PATCH 1/6] fix(embeddings): use non default tokenizer when passing
 list of lists of tokens (int)

---
 litellm/proxy/proxy_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 18e1d8d98a75..b215afc9e314 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3821,7 +3821,7 @@ async def embeddings( # noqa: PLR0915
                         input_list = []
                         for i in data["input"]:
                             input_list.append(
-                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
+                                litellm.decode(model=m["model_name"], tokens=i)
                             )
                         data["input"] = input_list
                         break

From 18f57d6cc1f3e3b0b432abfda7e1e31bf5db2e26 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Wed, 7 May 2025 18:52:37 +0200
Subject: [PATCH 2/6] feat(embeddings): allow for passthrough of list of lists
 of tokens to hosted_vllm models

---
 litellm/proxy/proxy_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index b215afc9e314..5c22bc4ea088 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3814,6 +3814,7 @@ async def embeddings( # noqa: PLR0915
                     if m["model_name"] == data["model"] and (
                         m["litellm_params"]["model"] in litellm.open_ai_embedding_models
                         or m["litellm_params"]["model"].startswith("azure/")
+                        or m["litellm_params"]["model"].startswith("hosted_vllm/")
                     ):
                         pass
                     else:

From a11ea61a4088a629b13c862caebbb45e38a044d3 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 15:13:37 +0200
Subject: [PATCH 3/6] Revert "fix(embeddings): use non default tokenizer when
 passing list of lists of tokens (int)"

This reverts commit a48acd95f860c4fc85853e20668eabffff07cae7.

---
 litellm/proxy/proxy_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 5c22bc4ea088..6cd3e845a652 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3822,7 +3822,7 @@ async def embeddings( # noqa: PLR0915
                         input_list = []
                         for i in data["input"]:
                             input_list.append(
-                                litellm.decode(model=m["model_name"], tokens=i)
+                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
                             )
                         data["input"] = input_list
                         break

From 9ae916646fc20319a6e48514fb7b8e3f8c9f8d9d Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 17:13:06 +0200
Subject: [PATCH 4/6] refactor(embeddings): use a list to verify whether the
 provider accepts a list of tokens as input

---
 litellm/constants.py          |  6 ++++++
 litellm/proxy/proxy_server.py |  9 ++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/litellm/constants.py b/litellm/constants.py
index fa944c0dfaa1..8e3d529c16d3 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -164,6 +164,12 @@
     "meta_llama",
 ]
 
+LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS = [
+    "openai",
+    "azure",
+    "hosted_vllm"
+]
+
 
 OPENAI_CHAT_COMPLETION_PARAMS = [
     "functions",
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 6cd3e845a652..3b8710982084 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -28,6 +28,7 @@
 from litellm.constants import (
     DEFAULT_MAX_RECURSE_DEPTH,
     DEFAULT_SLACK_ALERTING_THRESHOLD,
+    LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
 )
 from litellm.types.utils import (
     ModelResponse,
@@ -3808,13 +3809,15 @@ async def embeddings( # noqa: PLR0915
             and isinstance(data["input"][0], list)
             and isinstance(data["input"][0][0], int)
         ):  # check if array of tokens passed in
-            # check if non-openai/azure model called - e.g. for langchain integration
+            # check if the provider accepts a list of tokens as input - e.g. for langchain integration
             if llm_model_list is not None and data["model"] in router_model_names:
                 for m in llm_model_list:
                     if m["model_name"] == data["model"] and (
                         m["litellm_params"]["model"] in litellm.open_ai_embedding_models
-                        or m["litellm_params"]["model"].startswith("azure/")
-                        or m["litellm_params"]["model"].startswith("hosted_vllm/")
+                        or any(
+                            m["litellm_params"]["model"].startswith(provider)
+                            for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
+                        )
                     ):
                         pass
                     else:

From 94955a7c2e93e61958bd48e7c0b15105bf84c869 Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Mon, 12 May 2025 17:16:57 +0200
Subject: [PATCH 5/6] fix(embeddings): verify the model name before checking
 whether the provider accepts arrays of tokens as input

When a list of token arrays is passed as input, verify the provider of the
model by going through the list of models (`llm_model_list`). First, check
the model name, then look up the provider and verify whether it accepts
arrays of tokens as input. If it does, pass the input through unchanged;
otherwise, decode it.
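
Roughly, the per-deployment check now works like this (a simplified sketch
of the same logic, not the literal change; the name `accepts_token_arrays`
is only illustrative, the actual code is in the diff below):

    for m in llm_model_list:
        if m["model_name"] != data["model"]:
            continue  # not the requested deployment, keep looking
        accepts_token_arrays = (
            m["litellm_params"]["model"] in litellm.open_ai_embedding_models
            or any(
                m["litellm_params"]["model"].startswith(provider)
                for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
            )
        )
        if not accepts_token_arrays:
            # decode token arrays back to strings before forwarding the request
            data["input"] = [
                litellm.decode(model="gpt-3.5-turbo", tokens=i) for i in data["input"]
            ]
        break
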
Previously, the provider and the model name were verified at the same time,
which resulted in decoding even when the model currently being checked
(while looping over `llm_model_list`) was not the target one.
---
 litellm/proxy/proxy_server.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 3b8710982084..5eda8c5abb02 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3812,23 +3812,23 @@ async def embeddings( # noqa: PLR0915
             # check if the provider accepts a list of tokens as input - e.g. for langchain integration
             if llm_model_list is not None and data["model"] in router_model_names:
                 for m in llm_model_list:
-                    if m["model_name"] == data["model"] and (
-                        m["litellm_params"]["model"] in litellm.open_ai_embedding_models
-                        or any(
-                            m["litellm_params"]["model"].startswith(provider)
-                            for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
-                        )
-                    ):
-                        pass
-                    else:
-                        # non-openai/azure embedding model called with token input
-                        input_list = []
-                        for i in data["input"]:
-                            input_list.append(
-                                litellm.decode(model="gpt-3.5-turbo", tokens=i)
-                            )
-                        data["input"] = input_list
-                        break
+                    if m["model_name"] == data["model"]:
+                        if (m["litellm_params"]["model"] in litellm.open_ai_embedding_models
+                            or any(
+                                m["litellm_params"]["model"].startswith(provider)
+                                for provider in LITELLM_EMBEDDING_PROVIDERS_SUPPORTING_INPUT_ARRAY_OF_TOKENS
+                            )
+                        ):
+                            pass
+                        else:
+                            # non-openai/azure embedding model called with token input
+                            input_list = []
+                            for i in data["input"]:
+                                input_list.append(
+                                    litellm.decode(model="gpt-3.5-turbo", tokens=i)
+                                )
+                            data["input"] = input_list
+                        break
 
         ### CALL HOOKS ### - modify incoming data / reject request before calling the model
         data = await proxy_logging_obj.pre_call_hook(

From cedacd18bae48338fbc1aa42de74a0df50abdf8c Mon Sep 17 00:00:00 2001
From: Camille Farineau
Date: Tue, 13 May 2025 10:45:31 +0200
Subject: [PATCH 6/6] test(embedding): add unit test ensuring decode is
 bypassed for providers that accept arrays of tokens as input

Ref: https://github.com/BerriAI/litellm/issues/10113
---
 .../test_configs/test_config_no_auth.yaml |  6 ++
 tests/litellm/proxy/test_proxy_server.py  | 76 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 tests/litellm/proxy/test_configs/test_config_no_auth.yaml

diff --git a/tests/litellm/proxy/test_configs/test_config_no_auth.yaml b/tests/litellm/proxy/test_configs/test_config_no_auth.yaml
new file mode 100644
index 000000000000..1b6b9ad198e0
--- /dev/null
+++ b/tests/litellm/proxy/test_configs/test_config_no_auth.yaml
@@ -0,0 +1,6 @@
+model_list:
+- litellm_params:
+    model: hosted_vllm/embed_model
+  model_info:
+    description: this is a test embedding hosted_vllm model
+  model_name: vllm_embed_model
diff --git a/tests/litellm/proxy/test_proxy_server.py b/tests/litellm/proxy/test_proxy_server.py
index 919a00d67039..ba787bc1e1e2 100644
--- a/tests/litellm/proxy/test_proxy_server.py
+++ b/tests/litellm/proxy/test_proxy_server.py
@@ -1,9 +1,11 @@
+import asyncio
 import importlib
 import json
 import os
 import socket
 import subprocess
 import sys
+from unittest import mock
 from unittest.mock import AsyncMock, MagicMock, mock_open, patch
 
 import click
@@ -18,6 +20,51 @@
 )  # Adds the parent directory to the system-path
 
 import litellm
+from litellm.proxy.proxy_server import app, initialize
+
+example_embedding_result = {
+    "object": "list",
+    "data": [
+        {
+            "object": "embedding",
+            "index": 0,
+            "embedding": [
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+                -0.006929283495992422,
+                -0.005336422007530928,
+                -4.547132266452536e-05,
+                -0.024047505110502243,
+            ],
+        }
+    ],
+    "model": "text-embedding-3-small",
+    "usage": {"prompt_tokens": 5, "total_tokens": 5},
+}
+
+def mock_patch_aembedding():
+    return mock.patch(
+        "litellm.proxy.proxy_server.llm_router.aembedding",
+        return_value=example_embedding_result,
+    )
+
+@pytest.fixture(scope="function")
+def client_no_auth():
+    # Assuming litellm.proxy.proxy_server is an object
+    from litellm.proxy.proxy_server import cleanup_router_config_variables
+
+    cleanup_router_config_variables()
+    filepath = os.path.dirname(os.path.abspath(__file__))
+    config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml"
+    # initialize can get run in parallel; it sets specific variables for the FastAPI app, and since it runs in parallel, different tests could otherwise use the wrong variables
+    asyncio.run(initialize(config=config_fp, debug=True))
+    return TestClient(app)
 
 
 @pytest.mark.asyncio
@@ -189,3 +236,32 @@ def test_team_info_masking():
     print("Got exception: {}".format(exc_info.value))
     assert "secret-test-key" not in str(exc_info.value)
     assert "public-test-key" not in str(exc_info.value)
+
+
+@mock_patch_aembedding()
+def test_embedding_input_array_of_tokens(mock_aembedding, client_no_auth):
+    """
+    Test that decoding of token-array input is bypassed for supported providers
+
+    Ref: https://github.com/BerriAI/litellm/issues/10113
+    """
+    try:
+        test_data = {
+            "model": "vllm_embed_model",
+            "input": [[2046, 13269, 158208]],
+        }
+
+        response = client_no_auth.post("/v1/embeddings", json=test_data)
+
+        mock_aembedding.assert_called_once_with(
+            model="vllm_embed_model",
+            input=[[2046, 13269, 158208]],
+            metadata=mock.ANY,
+            proxy_server_request=mock.ANY,
+        )
+        assert response.status_code == 200
+        result = response.json()
+        print(len(result["data"][0]["embedding"]))
+        assert len(result["data"][0]["embedding"]) > 10  # mocked result has 12 values; real embeddings are usually 1536-dimensional
+    except Exception as e:
+        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")