diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7f9f85e1f93..67c7e109c9f 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1167,6 +1167,10 @@ def _abort(self, request_id: str) -> None:
             exception=asyncio.CancelledError,
             verbose=self.log_requests)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        return self.engine.get_vllm_config()
+
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         return self.engine.get_model_config()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 54f7b8fb69b..2347cdee904 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -914,6 +914,10 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
             scheduler.abort_seq_group(
                 request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
 
+    def get_vllm_config(self) -> VllmConfig:
+        """Gets the vllm configuration."""
+        return self.vllm_config
+
     def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index f058b13297b..6e56cbdbbf8 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -93,6 +93,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
         self._errored_with: Optional[BaseException] = None
 
         # Get the configs.
+        self.vllm_config = engine_config
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
@@ -377,6 +378,9 @@ async def get_input_preprocessor(self) -> InputPreprocessor:
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
         return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_decoding_config(self) -> DecodingConfig:
         return self.decoding_config
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index e2974b02c5b..7e5ac3a2845 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -5,7 +5,7 @@
 from typing import AsyncGenerator, List, Mapping, Optional
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
-from vllm.config import DecodingConfig, ModelConfig
+from vllm.config import DecodingConfig, ModelConfig, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -220,6 +220,11 @@ async def abort(self, request_id: str) -> None:
         """
         ...
 
+    @abstractmethod
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        ...
+
     @abstractmethod
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6a8bdd06022..2c15aa8a933 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -30,7 +30,7 @@
 from typing_extensions import assert_never
 
 import vllm.envs as envs
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
             "/load",
             "/ping",
             "/version",
+            "/server_info",
         ],
         registry=registry,
     ).add().instrument(app).expose(app)
@@ -687,6 +688,11 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
 
 if envs.VLLM_SERVER_DEV_MODE:
 
+    @router.get("/server_info")
+    async def show_server_info(raw_request: Request):
+        server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
+        return JSONResponse(content=server_info)
+
     @router.post("/reset_prefix_cache")
     async def reset_prefix_cache(raw_request: Request):
         """
@@ -894,7 +900,7 @@ async def log_response(request: Request, call_next):
 
 async def init_app_state(
     engine_client: EngineClient,
-    model_config: ModelConfig,
+    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
@@ -915,6 +921,8 @@ async def init_app_state(
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
+    state.vllm_config = vllm_config
+    model_config = vllm_config.model_config
 
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
@@ -1069,8 +1077,8 @@ def signal_handler(*_) -> None:
 
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
-        model_config = await engine_client.get_model_config()
-        await init_app_state(engine_client, model_config, app.state, args)
+        vllm_config = await engine_client.get_vllm_config()
+        await init_app_state(engine_client, vllm_config, app.state, args)
 
         def _listen_addr(a: str) -> str:
             if is_valid_ipv6_address(a):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b77a6824cdd..6d24ba2bc98 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -64,7 +64,7 @@ def __init__(
         assert start_engine_loop
 
         self.model_config = vllm_config.model_config
-
+        self.vllm_config = vllm_config
         self.log_requests = log_requests
         self.log_stats = log_stats
 
@@ -379,6 +379,9 @@ def encode(
     ):
         raise ValueError("Not Supported on V1 yet.")
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 4c67186f704..c05319f3d80 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -230,6 +230,9 @@ def step(self) -> list[RequestOutput]:
 
         return processed_outputs.request_outputs
 
+    def get_vllm_config(self):
+        return self.vllm_config
+
     def get_model_config(self):
         return self.model_config
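
A minimal usage sketch (not part of the patch): the new /server_info route is only registered when the server runs with VLLM_SERVER_DEV_MODE enabled, and it returns the stringified VllmConfig stored on app.state. The host/port and the use of urllib below are assumptions for illustration only.

# Hypothetical client-side check of the new endpoint; assumes a vLLM
# OpenAI-compatible server started with VLLM_SERVER_DEV_MODE=1 on localhost:8000.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/server_info") as resp:
    server_info = json.load(resp)

# Per the handler above, the payload is {"vllm_config": str(vllm_config)}.
print(server_info["vllm_config"])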