sgl-project · zhyncs · Jun 21, 2025 · Jun 18, 2025 · Jun 19, 2025 · Jun 19, 2025
diff --git a/benchmark/hicache/data_processing.py b/benchmark/hicache/data_processing.py
@@ -20,7 +20,7 @@
     get_gen_prefix_cache_path,
 )
 from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
-from sglang.srt.openai_api.protocol import ChatCompletionMessageContentPart
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionMessageContentPart
 from sglang.utils import encode_video_base64
 
 # type of content fields, can be only prompts or with images/videos

@@ -64,11 +64,14 @@
     "text = \"Once upon a time\"\n",
     "\n",
     "curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+    "  -H \"Content-Type: application/json\" \\\n",
     "  -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
     "\n",
-    "text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
-    "    \"embedding\"\n",
-    "]\n",
+    "result = subprocess.check_output(curl_text, shell=True)\n",
+    "\n",
+    "print(result)\n",
+    "\n",
+    "text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
     "\n",
     "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
    ]
@@ -152,6 +155,7 @@
     "input_ids = tokenizer.encode(text)\n",
     "\n",
     "curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+    "  -H \"Content-Type: application/json\" \\\n",
     "  -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
     "\n",
     "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",

@@ -67,6 +67,7 @@
     "\n",
     "curl_command = f\"\"\"\n",
     "curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
+    "  -H \"Content-Type: application/json\" \\\\\n",
     "  -d '{{\n",
     "    \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
     "    \"messages\": [\n",

@@ -36,7 +36,7 @@
     "import requests\n",
     "from PIL import Image\n",
     "\n",
-    "from sglang.srt.openai_api.protocol import ChatCompletionRequest\n",
+    "from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest\n",
     "from sglang.srt.conversation import chat_templates\n",
     "\n",
     "image = Image.open(\n",

@@ -15,9 +15,7 @@
 
 
 import dataclasses
-import json
 import logging
-import os
 from enum import auto
 
 from sglang.srt.entrypoints.openai.protocol import CompletionRequest
@@ -57,46 +55,6 @@ class CompletionTemplate:
 completion_templates: dict[str, CompletionTemplate] = {}
 
 
-def load_completion_template_for_openai_api(completion_template_arg):
-    global completion_template_name
-
-    logger.info(
-        f"Use completion template for the OpenAI-compatible API server: {completion_template_arg}"
-    )
-
-    if not completion_template_exists(completion_template_arg):
-        if not os.path.exists(completion_template_arg):
-            raise RuntimeError(
-                f"Completion template {completion_template_arg} is not a built-in template name "
-                "or a valid completion template file path."
-            )
-
-        assert completion_template_arg.endswith(
-            ".json"
-        ), "unrecognized format of completion template file"
-        with open(completion_template_arg, "r") as filep:
-            template = json.load(filep)
-            try:
-                fim_position = FimPosition[template["fim_position"]]
-            except KeyError:
-                raise ValueError(
-                    f"Unknown fim position: {template['fim_position']}"
-                ) from None
-            register_completion_template(
-                CompletionTemplate(
-                    name=template["name"],
-                    fim_begin_token=template["fim_begin_token"],
-                    fim_middle_token=template["fim_middle_token"],
-                    fim_end_token=template["fim_end_token"],
-                    fim_position=fim_position,
-                ),
-                override=True,
-            )
-        completion_template_name = template["name"]
-    else:
-        completion_template_name = completion_template_arg
-
-
 def register_completion_template(template: CompletionTemplate, override: bool = False):
     """Register a new completion template."""
     if not override:

@@ -11,7 +11,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Conversation chat templates."""
+"""Conversation chat templates.
+
+This module provides conversation template definitions, data structures, and utilities
+for managing chat templates across different model types in SGLang.
+
+Key components:
+- Conversation class: Defines the structure and behavior of chat templates
+- SeparatorStyle enum: Different conversation formatting styles
+- Template registry: Functions to register and retrieve templates by name or model path
+- Built-in templates: Pre-defined templates for popular models
+"""
 
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
@@ -20,7 +30,7 @@
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
-from sglang.srt.openai_api.protocol import ChatCompletionRequest
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
 from sglang.srt.utils import read_system_prompt_from_file
 
 
@@ -618,7 +628,7 @@ def generate_chat_conv(
 
 
 # llama2 template
-# reference: https://huggingface.co/blog/codellama#conversational-instructions
+# reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
 register_conv_template(
     Conversation(

@@ -37,7 +37,6 @@
 import torch
 import uvloop
 
-from sglang.srt.code_completion_parser import load_completion_template_for_openai_api
 from sglang.srt.entrypoints.EngineBase import EngineBase
 from sglang.srt.managers.data_parallel_controller import (
     run_data_parallel_controller_process,
@@ -58,11 +57,8 @@
     UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.openai_api.adapter import (
-    guess_chat_template_name_from_model_path,
-    load_chat_template_for_openai_api,
-)
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -123,12 +119,13 @@ def __init__(self, **kwargs):
         logger.info(f"{server_args=}")
 
         # Launch subprocesses
-        tokenizer_manager, scheduler_info = _launch_subprocesses(
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
             port_args=port_args,
         )
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
+        self.template_manager = template_manager
         self.scheduler_info = scheduler_info
 
         context = zmq.Context(2)
@@ -647,7 +644,7 @@ def sigquit_handler(signum, frame):
 
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
-) -> Tuple[TokenizerManager, Dict]:
+) -> Tuple[TokenizerManager, TemplateManager, Dict]:
     """
     Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
     """
@@ -732,7 +729,7 @@ def _launch_subprocesses(
 
         if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
             # When using `Engine` as a Python API, we don't want to block here.
-            return None, None
+            return None, None, None
 
         launch_dummy_health_check_server(server_args.host, server_args.port)
 
@@ -741,7 +738,7 @@ def _launch_subprocesses(
             logger.error(
                 f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
             )
-        return None, None
+        return None, None, None
 
     # Launch detokenizer process
     detoken_proc = mp.Process(
@@ -755,15 +752,15 @@ def _launch_subprocesses(
 
     # Launch tokenizer process
     tokenizer_manager = TokenizerManager(server_args, port_args)
-    if server_args.chat_template:
-        load_chat_template_for_openai_api(
-            tokenizer_manager, server_args.chat_template, server_args.model_path
-        )
-    else:
-        guess_chat_template_name_from_model_path(server_args.model_path)
 
-    if server_args.completion_template:
-        load_completion_template_for_openai_api(server_args.completion_template)
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
 
     # Wait for the model to finish loading
     scheduler_infos = []
@@ -787,4 +784,4 @@ def _launch_subprocesses(
     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
-    return tokenizer_manager, scheduler_info
+    return tokenizer_manager, template_manager, scheduler_info