Skip to content

feat(oai refactor): Replace openai_api with entrypoints/openai #7351

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Jun 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
853b9d3
fix finish_reason type for protocol.py
CatherineSue Jun 18, 2025
4be8420
Remove sglang/srt/openai_api
CatherineSue Jun 19, 2025
a5a0fe7
Remove openai/api_server.py and conftest.py
CatherineSue Jun 19, 2025
854ec42
Fix chat_template_name not found error
CatherineSue Jun 19, 2025
e665bff
Remove batch test in test_openai_server.py
CatherineSue Jun 19, 2025
c8efdd4
Use logger.exception in OpenAIServingBase
CatherineSue Jun 19, 2025
21b88a2
Fix variables not found error in chat_template
CatherineSue Jun 19, 2025
3aefe08
refactor: centralize template management and optimize Jinja template …
CatherineSue Jun 19, 2025
fba892e
Fix imports, is_multimodal not found, and enable_thinking parameter
CatherineSue Jun 19, 2025
39fddf6
Fix test_embedding_openai_server
jhinpan Jun 19, 2025
d2863cd
Temporarily disable `_generate_request_id_base`
CatherineSue Jun 19, 2025
3548b2d
Revert changes in Tokenizer manager & io_struct
jhinpan Jun 19, 2025
3a4eb59
Revert changes in `test_embedding_openai_server.py`
CatherineSue Jun 19, 2025
48c93f3
Add validation_exception_handler to http_server.py
CatherineSue Jun 20, 2025
8467a30
Skip embedding in log_requests
CatherineSue Jun 20, 2025
256dd1c
Remove V1RerankReqInput from io_struct
CatherineSue Jun 20, 2025
d61e6c9
Fix lint and enable_thinking parameter
CatherineSue Jun 20, 2025
2a09f33
Fix typings
CatherineSue Jun 20, 2025
40fda4e
Clear unused imports in entrypoints/openai and update tests
CatherineSue Jun 21, 2025
c46ebcb
Fix notebook curl command and add validation for json media-type in h…
CatherineSue Jun 21, 2025
b55abde
Clear notebook output
CatherineSue Jun 21, 2025
96685e0
Fix rebase error
CatherineSue Jun 21, 2025
0e635fa
Merge branch 'main' into chang/remove-adapter
zhyncs Jun 21, 2025
2eec38d
Fix hidden_states CI
CatherineSue Jun 21, 2025
61b352d
Merge branch 'main' into chang/remove-adapter
zhyncs Jun 21, 2025
c36b28e
Remove single List[str] input handling in serving_embedding.py
CatherineSue Jun 21, 2025
e583fd9
Fix reasoning_content CI by removing `enable_thinking` in reasoning c…
CatherineSue Jun 21, 2025
0f40188
Fix func name in `test_jinja_template_utils.py`
CatherineSue Jun 21, 2025
b22eb4a
Fix Qwen3ReasoningParser to work with enable_thinking parameter
CatherineSue Jun 21, 2025
50287ba
Fix BaseReasoningFormatDetector to handle streaming when force_reason…
CatherineSue Jun 21, 2025
2dc0341
Simplify the logic to handle streaming in BaseReasoningFormatDetector
CatherineSue Jun 21, 2025
e6b445e
Fix wrong import in jinja_template_utils.py
CatherineSue Jun 21, 2025
8ddc232
Fix tool call start check in BaseFormatDetector
CatherineSue Jun 21, 2025
376a589
Merge remote-tracking branch 'origin/main' into moirai/chang/remove-a…
CatherineSue Jun 21, 2025
8a867ff
Merge branch 'main' into chang/remove-adapter
CatherineSue Jun 21, 2025
83413ae
Add support for v1/models/{model_id}
CatherineSue Jun 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/hicache/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
get_gen_prefix_cache_path,
)
from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
from sglang.srt.openai_api.protocol import ChatCompletionMessageContentPart
from sglang.srt.entrypoints.openai.protocol import ChatCompletionMessageContentPart
from sglang.utils import encode_video_base64

# type of content fields, can be only prompts or with images/videos
Expand Down
10 changes: 7 additions & 3 deletions docs/backend/openai_api_embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,14 @@
"text = \"Once upon a time\"\n",
"\n",
"curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -H \"Content-Type: application/json\" \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
"\n",
"text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
" \"embedding\"\n",
"]\n",
"result = subprocess.check_output(curl_text, shell=True)\n",
"\n",
"print(result)\n",
"\n",
"text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
"\n",
"print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
]
Expand Down Expand Up @@ -152,6 +155,7 @@
"input_ids = tokenizer.encode(text)\n",
"\n",
"curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -H \"Content-Type: application/json\" \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
"\n",
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
Expand Down
1 change: 1 addition & 0 deletions docs/backend/openai_api_vision.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"\n",
"curl_command = f\"\"\"\n",
"curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
" -H \"Content-Type: application/json\" \\\\\n",
" -d '{{\n",
" \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
" \"messages\": [\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/backend/vlm_query.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"import requests\n",
"from PIL import Image\n",
"\n",
"from sglang.srt.openai_api.protocol import ChatCompletionRequest\n",
"from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest\n",
"from sglang.srt.conversation import chat_templates\n",
"\n",
"image = Image.open(\n",
Expand Down
42 changes: 0 additions & 42 deletions python/sglang/srt/code_completion_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@


import dataclasses
import json
import logging
import os
from enum import auto

from sglang.srt.entrypoints.openai.protocol import CompletionRequest
Expand Down Expand Up @@ -57,46 +55,6 @@ class CompletionTemplate:
completion_templates: dict[str, CompletionTemplate] = {}


def load_completion_template_for_openai_api(completion_template_arg):
    """Select the completion template used by the OpenAI-compatible API server.

    If ``completion_template_arg`` names a template that is already
    registered, that name is stored in the module-level
    ``completion_template_name``. Otherwise the argument is treated as a path
    to a ``.json`` file; the file is loaded, registered (overriding any
    template of the same name), and its ``"name"`` entry is stored instead.

    Args:
        completion_template_arg: A registered template name, or the path to a
            JSON file providing ``name``, ``fim_begin_token``,
            ``fim_middle_token``, ``fim_end_token`` and ``fim_position``.

    Raises:
        RuntimeError: The argument is neither a registered template name nor
            an existing file path.
        AssertionError: The path exists but does not end with ``.json``.
        ValueError: ``fim_position`` is missing from the file or is not a key
            of ``FimPosition``.
    """
    global completion_template_name

    logger.info(
        f"Use completion template for the OpenAI-compatible API server: {completion_template_arg}"
    )

    # Built-in / previously registered name: nothing to load.
    if completion_template_exists(completion_template_arg):
        completion_template_name = completion_template_arg
        return

    if not os.path.exists(completion_template_arg):
        raise RuntimeError(
            f"Completion template {completion_template_arg} is not a built-in template name "
            "or a valid completion template file path."
        )

    assert completion_template_arg.endswith(
        ".json"
    ), "unrecognized format of completion template file"

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(completion_template_arg, "r", encoding="utf-8") as filep:
        template = json.load(filep)

    try:
        fim_position = FimPosition[template["fim_position"]]
    except KeyError:
        # Use .get() here: the KeyError may mean the key itself is absent, and
        # indexing again would raise a second KeyError instead of the
        # ValueError below.
        raise ValueError(
            f"Unknown fim position: {template.get('fim_position')}"
        ) from None

    register_completion_template(
        CompletionTemplate(
            name=template["name"],
            fim_begin_token=template["fim_begin_token"],
            fim_middle_token=template["fim_middle_token"],
            fim_end_token=template["fim_end_token"],
            fim_position=fim_position,
        ),
        override=True,
    )
    completion_template_name = template["name"]


def register_completion_template(template: CompletionTemplate, override: bool = False):
"""Register a new completion template."""
if not override:
Expand Down
16 changes: 13 additions & 3 deletions python/sglang/srt/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Conversation chat templates."""
"""Conversation chat templates.

This module provides conversation template definitions, data structures, and utilities
for managing chat templates across different model types in SGLang.

Key components:
- Conversation class: Defines the structure and behavior of chat templates
- SeparatorStyle enum: Different conversation formatting styles
- Template registry: Functions to register and retrieve templates by name or model path
- Built-in templates: Pre-defined templates for popular models
"""

# Adapted from
# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
Expand All @@ -20,7 +30,7 @@
from enum import IntEnum, auto
from typing import Callable, Dict, List, Optional, Tuple, Union

from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
from sglang.srt.utils import read_system_prompt_from_file


Expand Down Expand Up @@ -618,7 +628,7 @@ def generate_chat_conv(


# llama2 template
# reference: https://huggingface.co/blog/codellama#conversational-instructions
# reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
register_conv_template(
Conversation(
Expand Down
33 changes: 15 additions & 18 deletions python/sglang/srt/entrypoints/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
import torch
import uvloop

from sglang.srt.code_completion_parser import load_completion_template_for_openai_api
from sglang.srt.entrypoints.EngineBase import EngineBase
from sglang.srt.managers.data_parallel_controller import (
run_data_parallel_controller_process,
Expand All @@ -58,11 +57,8 @@
UpdateWeightsFromTensorReqInput,
)
from sglang.srt.managers.scheduler import run_scheduler_process
from sglang.srt.managers.template_manager import TemplateManager
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.openai_api.adapter import (
guess_chat_template_name_from_model_path,
load_chat_template_for_openai_api,
)
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import (
Expand Down Expand Up @@ -123,12 +119,13 @@ def __init__(self, **kwargs):
logger.info(f"{server_args=}")

# Launch subprocesses
tokenizer_manager, scheduler_info = _launch_subprocesses(
tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
server_args=server_args,
port_args=port_args,
)
self.server_args = server_args
self.tokenizer_manager = tokenizer_manager
self.template_manager = template_manager
self.scheduler_info = scheduler_info

context = zmq.Context(2)
Expand Down Expand Up @@ -647,7 +644,7 @@ def sigquit_handler(signum, frame):

def _launch_subprocesses(
server_args: ServerArgs, port_args: Optional[PortArgs] = None
) -> Tuple[TokenizerManager, Dict]:
) -> Tuple[TokenizerManager, TemplateManager, Dict]:
"""
Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
"""
Expand Down Expand Up @@ -732,7 +729,7 @@ def _launch_subprocesses(

if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
# When using `Engine` as a Python API, we don't want to block here.
return None, None
return None, None, None

launch_dummy_health_check_server(server_args.host, server_args.port)

Expand All @@ -741,7 +738,7 @@ def _launch_subprocesses(
logger.error(
f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
)
return None, None
return None, None, None

# Launch detokenizer process
detoken_proc = mp.Process(
Expand All @@ -755,15 +752,15 @@ def _launch_subprocesses(

# Launch tokenizer process
tokenizer_manager = TokenizerManager(server_args, port_args)
if server_args.chat_template:
load_chat_template_for_openai_api(
tokenizer_manager, server_args.chat_template, server_args.model_path
)
else:
guess_chat_template_name_from_model_path(server_args.model_path)

if server_args.completion_template:
load_completion_template_for_openai_api(server_args.completion_template)
# Initialize templates
template_manager = TemplateManager()
template_manager.initialize_templates(
tokenizer_manager=tokenizer_manager,
model_path=server_args.model_path,
chat_template=server_args.chat_template,
completion_template=server_args.completion_template,
)

# Wait for the model to finish loading
scheduler_infos = []
Expand All @@ -787,4 +784,4 @@ def _launch_subprocesses(
# Assume all schedulers have the same scheduler_info
scheduler_info = scheduler_infos[0]
tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
return tokenizer_manager, scheduler_info
return tokenizer_manager, template_manager, scheduler_info
Loading
Loading