LMDeploy Distserve #3304


Merged (86 commits) on May 8, 2025

Commits
97d6d5d
sync main
JimyMa Apr 1, 2025
3241c1a
typo correct
JimyMa Apr 2, 2025
1788a28
1. typo 2. add migration event
JimyMa Apr 2, 2025
03b363f
1. move slime to 'https://github.com/JimyMa/DLSlime.git' and init rea…
JimyMa Apr 3, 2025
aabb72b
Update disagg README
JimyMa Apr 3, 2025
3ba605f
mute slime when disable distserve
JimyMa Apr 3, 2025
2e6ee7a
remove build_migration.sh
JimyMa Apr 3, 2025
cdf55c1
revert debug code
JimyMa Apr 3, 2025
ace6ece
1. identify interface. 2. add multi backend registry
JimyMa Apr 6, 2025
481052e
add dlslime max transfer batch
JimyMa Apr 6, 2025
f9b7409
add an infinistore interface
JimyMa Apr 6, 2025
60032b6
add load/store
JimyMa Apr 7, 2025
aa43faa
conditional register of Multi Migration Backend
JimyMa Apr 8, 2025
97e4430
merge router to proxy
JimyMa Apr 11, 2025
1e6c4da
remove redandunt print
JimyMa Apr 11, 2025
290e606
Merge branch 'main' of github.com:JimyMa/lmdeploy into distserve-update
JimyMa Apr 11, 2025
b530384
1. remove redandunt print 2. revert safe_run
JimyMa Apr 11, 2025
efcb72c
dsv3 kvtransfer support (bypass v cache)
JimyMa Apr 12, 2025
a3d973b
dsv3 debug, 1. change log info to log debug of log resp. 2. add num_c…
JimyMa Apr 12, 2025
31fd9f3
DSV3 Debug, known issue:
JimyMa Apr 14, 2025
48d791a
revert match to if,else
JimyMa Apr 14, 2025
2f02e05
[bugfix] rename typo
JimyMa Apr 14, 2025
ae959a0
[refactor] refactor pd_conn
JimyMa Apr 14, 2025
11d9961
1. format code. 2. add engine_role for passing ut test
JimyMa Apr 14, 2025
18da0fb
1. format code 2. parse dp, ep, and dp rank to DisaggEngineConfig
JimyMa Apr 14, 2025
a478c77
1. add pd conn timeout, 2. add default EngineRole to Hybrid, 3. fix d…
JimyMa Apr 15, 2025
c490de4
1. refactor PDConnection Pool
JimyMa Apr 17, 2025
df3f9ef
refactor debug
JimyMa Apr 18, 2025
61ad2a7
fix migration loop bug
JimyMa Apr 18, 2025
ad27c3a
add proxy arguments about distserve
JimyMa Apr 18, 2025
1c3b20c
bugfix
JimyMa Apr 18, 2025
119059f
debug interface
JimyMa Apr 18, 2025
1f220d4
remove unnesessary EngineRole Check.
JimyMa Apr 18, 2025
0a58979
add v1/chat/completions support
JimyMa Apr 18, 2025
83838d8
remove redundent print
JimyMa Apr 18, 2025
b108752
async free cache
JimyMa Apr 18, 2025
74d9256
async free cache
JimyMa Apr 18, 2025
39b2c4f
Merge branch 'main' of github.com:JimyMa/lmdeploy into distserve-micr…
JimyMa Apr 19, 2025
65ba59f
1. add some comments.
JimyMa Apr 19, 2025
3af751b
1. bugfix
JimyMa Apr 21, 2025
6028ec2
[proxy] add connection_warmup api
JimyMa Apr 21, 2025
3047e7b
1. bugfix (warmup_connection_typo and wrong args) 2. preserve cache b…
JimyMa Apr 21, 2025
649b51e
[disagg] update readme, 1. fault tolerance and 2. replace router to p…
JimyMa Apr 21, 2025
531524a
bugfix
JimyMa Apr 21, 2025
ce660ca
fix decode back pressure bug
JimyMa Apr 21, 2025
957bd68
1. add migration_request to chat/completions for correctly cache free
JimyMa Apr 21, 2025
f6de868
2. free cache bugfix
JimyMa Apr 22, 2025
7437bfa
1. fix lock running bug
JimyMa Apr 22, 2025
b0a8f1f
1. fix dist.broadcast deadlock
JimyMa Apr 23, 2025
a7bb7c4
[lint] 1. fix lint
JimyMa Apr 24, 2025
d488d87
rename Ethernet to RoCE
JimyMa Apr 24, 2025
b626d9e
change emun.Enum.__members__[elem] to enum.Enum[elem] directly
JimyMa Apr 24, 2025
2d6f8c1
update readme
JimyMa Apr 24, 2025
fec61ba
update migration-backend
JimyMa Apr 24, 2025
2637091
1. update readme 2. move module to string for conditional import
JimyMa Apr 24, 2025
3dedc69
1. update readme
JimyMa Apr 24, 2025
c09a06b
1. remove migic number and handle long assignments in dlslime. 2. add…
JimyMa Apr 25, 2025
160cb3c
fix error migration in dummy situation
JimyMa Apr 25, 2025
e97a486
1. bugfix when token is not a decodable utf-8 (in test)
JimyMa Apr 25, 2025
0eb588a
1. overlapping migration and forward.
JimyMa Apr 26, 2025
a048dfd
bump dlslime to v0.0.1.post5
JimyMa Apr 29, 2025
506bdb2
remove print
JimyMa Apr 29, 2025
4e0f31d
remove free in decode engine because already freed in proxy
JimyMa Apr 29, 2025
3f53e64
1. bump dlslime to 0.0.1.post7
JimyMa May 6, 2025
b70fc44
1. [proxy] revert self.nodes to nodes 2. [api_server] remove redundan…
JimyMa May 6, 2025
6498133
Merge branch 'main' of https://github.com/JimyMa/LMDeploy into distse…
JimyMa May 6, 2025
8d89f55
1. [cli] remove available_nic args
JimyMa May 6, 2025
4ac8f37
format comments
JimyMa May 6, 2025
d858e81
[pytorch paging] remove redundant logger
JimyMa May 6, 2025
6741c48
[model_agent] bugfix caused by merge
JimyMa May 6, 2025
10a70c9
[model agent] bypass model agent migrate
JimyMa May 7, 2025
c9d9e13
revert migrate to sync mode
JimyMa May 7, 2025
d292bf5
bypass model agent migrate in uni_executor
JimyMa May 7, 2025
70dc438
[proxy] set default serving strategy to DistServe
JimyMa May 7, 2025
2c54627
1. [disagg] update readme
JimyMa May 7, 2025
82a0a58
info -> debug
JimyMa May 7, 2025
ab4a5b9
remove unused code
JimyMa May 7, 2025
c8212e3
lazily initialize migration event
JimyMa May 7, 2025
0e83d26
add nvlink support
JimyMa May 7, 2025
5312fac
mute TCP support by now
JimyMa May 7, 2025
53091e3
update readme for execption
JimyMa May 7, 2025
4af8d3d
set migration token_ids output to numpy array
JimyMa May 7, 2025
76c3a04
update readme
JimyMa May 7, 2025
5f10df9
In PD Disaggregation Mode, fallback next token ids to CPU
JimyMa May 7, 2025
25f3488
1. [disagg] update readme
JimyMa May 8, 2025
2c70c55
move disagg to pytorch backend
JimyMa May 8, 2025
Files changed
4 changes: 2 additions & 2 deletions benchmark/profile_generation.py
@@ -178,7 +178,7 @@ async def _gather_tasks(tasks):

out_token_throughput = np.round(token_latency_stats.size / elapsed_time, 2)
total_token_throughput = np.round(concurrency * test_round * (input_seqlen + output_seqlen) / elapsed_time, 2)
print(f'\n{"-" * 50}\ntotal time: {elapsed_time:.2f}s\n'
print(f'\n{" - " * 50}\ntotal time: {elapsed_time:.2f}s\n'
f'concurrency: {concurrency}, test_round: {test_round}\n'
f'input_tokens: {input_seqlen}, output_tokens: {output_seqlen}\n'
f'first_token latency(min, max, ave): '
@@ -188,7 +188,7 @@ async def _gather_tasks(tasks):
f'{token_latency_ave}s\n'
f'token_latency percentiles(50%,75%,95%,99%)(s): {percentiles}\n'
f'throughput(output): {out_token_throughput} token/s\n'
- f'throughput(total): {total_token_throughput} token/s\n{"-" * 50}')
+ f'throughput(total): {total_token_throughput} token/s\n{" - " * 50}')
return model_path, \
[first_token_latency_min, first_token_latency_max,
first_token_latency_ave], \
25 changes: 22 additions & 3 deletions lmdeploy/cli/serve.py
@@ -1,5 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.

from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
from lmdeploy.utils import get_max_batch_size

from .cli import CLI
@@ -167,6 +167,8 @@ def add_parser_api_server():
ArgumentHelper.dp_rank(pt_group)
ArgumentHelper.ep(pt_group)
ArgumentHelper.enable_microbatch(pt_group)
ArgumentHelper.role(pt_group)
ArgumentHelper.migration_backend(pt_group)

# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
@@ -216,7 +218,13 @@ def add_parser_proxy():
parser.set_defaults(run=SubCliServe.proxy)
parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Host ip for proxy serving')
parser.add_argument('--server-port', type=int, default=8000, help='Server port of the proxy')
- parser.add_argument('--strategy',
+ parser.add_argument('--serving-strategy',
type=str,
choices=['Hybrid', 'DistServe'],
default='Hybrid',
help='the strategy to serve, Hybrid for colocating Prefill and Decode '
'workloads into the same engine, DistServe for Prefill-Decode Disaggregation')
parser.add_argument('--routing-strategy',
type=str,
choices=['random', 'min_expected_latency', 'min_observed_latency'],
default='min_expected_latency',
@@ -226,6 +234,15 @@
help='Whether to disable cache status of the '
'proxy. If set, the proxy will forget the status '
'of the previous time')

# For Disaggregation
parser.add_argument('--migration-protocol',
type=str,
choices=['RDMA', 'NVLINK'],
default='RDMA',
help='transport protocol of KV migration')
parser.add_argument('--link-type', type=str, choices=['RoCE', 'IB'], default='RoCE', help='RDMA Link Type')
parser.add_argument('--disable-gdr', action='store_true', help='disable GPU Direct RDMA for KV cache migration')
ArgumentHelper.api_keys(parser)
ArgumentHelper.ssl(parser)
ArgumentHelper.log_level(parser)
@@ -311,7 +328,9 @@ def api_server(args):
quant_policy=args.quant_policy,
eager_mode=args.eager_mode,
max_prefill_token_num=args.max_prefill_token_num,
- enable_microbatch=args.enable_microbatch)
+ enable_microbatch=args.enable_microbatch,
+ role=EngineRole[args.role],
+ migration_backend=MigrationBackend[args.migration_backend])
else:
from lmdeploy.messages import TurbomindEngineConfig
backend_config = TurbomindEngineConfig(dtype=args.dtype,
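In `api_server`, the CLI strings are mapped to the disaggregation enums by name lookup (`EngineRole[args.role]`, per commit b626d9e). A minimal sketch of that lookup, assuming the enum members match the CLI choices:

```python
from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend

# Name-based lookup, as in `EngineRole[args.role]` above.
role = EngineRole['Prefill']
backend = MigrationBackend['DLSlime']
assert role is EngineRole.Prefill
assert backend is MigrationBackend.DLSlime
```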
19 changes: 19 additions & 0 deletions lmdeploy/cli/utils.py
@@ -527,3 +527,22 @@ def enable_microbatch(parser):
return parser.add_argument('--enable-microbatch',
action='store_true',
help='enable microbatch for specified model')

# For Disaggregation
@staticmethod
def role(parser):
return parser.add_argument('--role',
type=str,
default='Hybrid',
choices=['Hybrid', 'Prefill', 'Decode'],
help='Hybrid for Non-Disaggregated Engine; '
'Prefill for Disaggregated Prefill Engine; '
'Decode for Disaggregated Decode Engine')

@staticmethod
def migration_backend(parser):
return parser.add_argument('--migration-backend',
type=str,
default='DLSlime',
choices=['DLSlime'],
help='KV cache migration backend used in PD disaggregation')
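Together with the proxy flags, a hypothetical engine-side launch using these new arguments might look like this (model name and proxy URL are illustrative, taken from the README below):

```shell
# Hypothetical prefill-engine launch using the flags defined above.
lmdeploy serve api_server internlm/internlm2_5-7b-chat \
    --backend pytorch \
    --role Prefill \
    --migration-backend DLSlime \
    --proxy-url http://0.0.0.0:8000
```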
19 changes: 19 additions & 0 deletions lmdeploy/messages.py
@@ -6,6 +6,9 @@
import torch
from pydantic.dataclasses import dataclass as pydantic_dataclass

from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
from lmdeploy.pytorch.disagg.request import MigrationRequest

from .tokenizer import Tokenizer
from .utils import get_logger

@@ -107,6 +110,11 @@ class GenerationConfig:
output_logits: Literal['all', 'generation'] = None
output_last_hidden_state: Literal['all', 'generation'] = None

# for disaggregation
with_cache: bool = False
preserve_cache: bool = False
migration_request: Optional[MigrationRequest] = None

def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer):
"""convert stop_words/bad_sords to ids and append the ids to
stop_token_ids/bad_token_ids."""
@@ -298,6 +306,10 @@ class PytorchEngineConfig:
distributed_executor_backend (str): backend of distributed backend,
options: ['uni', 'mp', 'ray']
enable_microbatch (bool): enable microbatch for specified model
role (EngineRole): role of engine, options: ['Hybrid', 'Prefill',
'Decode']. Defaults to `EngineRole.Hybrid`.
migration_backend (MigrationBackend): migration backend, options:
['DLSlime']. Defaults to `MigrationBackend.DLSlime`.
"""
dtype: str = 'auto'
tp: int = 1
@@ -324,6 +336,9 @@
distributed_executor_backend: str = None
enable_microbatch: bool = False

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

def __post_init__(self):
"""Check input validation."""
assert self.dtype in ['auto', 'float16', 'bfloat16']
@@ -404,6 +419,8 @@ class EngineOutput:
may not equal to the length of token_ids
logprobs (List[Dict[int, float]]): the top logprobs for each output
position.
cache_block_ids (List[int]): cache block ids sent back for KV cache
migration in disaggregated LLM serving once the prefill engine is done.
"""
status: ResponseType
token_ids: List[int]
@@ -412,6 +429,8 @@
logits: torch.Tensor = None
last_hidden_state: torch.Tensor = None

cache_block_ids: Optional[List[int]] = None


@dataclass
class VisionConfig:
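The new `GenerationConfig` fields drive the prefill side of a disaggregated request. A minimal sketch; `with_cache`, `preserve_cache`, and `migration_request` come from this diff, while the prefill-only semantics are an inference from the docstrings above:

```python
from lmdeploy.messages import GenerationConfig

# Prefill-side request: keep the KV cache alive and report its block ids so
# the decode engine can migrate it later.
gen_config = GenerationConfig(
    max_new_tokens=1,     # prefill produces only the first token
    with_cache=True,      # EngineOutput.cache_block_ids gets populated
    preserve_cache=True,  # blocks are not freed until migration completes
)
```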
6 changes: 6 additions & 0 deletions lmdeploy/pytorch/config.py
@@ -4,6 +4,8 @@

import torch

from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend


def _update_torch_dtype(config: 'ModelConfig', dtype: str):
"""Update the torch dtype from the model config.
@@ -80,6 +82,10 @@ class CacheConfig:
quant_policy: Literal[0, 4, 8] = 0
device_type: str = 'cuda'

# For PD Disaggregation
role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

def __post_init__(self):
"""post init."""
from lmdeploy.utils import get_logger
103 changes: 103 additions & 0 deletions lmdeploy/pytorch/disagg/README.md
@@ -0,0 +1,103 @@
# LMDeploy-DistServe

## Key Components

1. **Router Service**: Coordinates between prefill/decode engines
2. **Migration Manager**: Facilitates high-performance memory sharing

## Installation

```shell
# Inference Engine
pip install "lmdeploy[all]>=0.7.0"

# Transfer Engine
pip install "dlslime>=0.0.1.post7"
```

## Quick Start

A PD-disaggregated deployment, using internlm2_5-7b-chat as the example model, is shown below:

### 1. Launch Router Service

```shell
lmdeploy serve proxy --server-name 0.0.0.0 --server-port 8000 --routing-strategy "min_expected_latency" --serving-strategy DistServe --log-level INFO
```

LMDeploy-DistServe supports both NVLink and RDMA for transferring the KV cache from the Prefill Engine to the Decode Engine. RDMA is the default protocol; pass `--migration-protocol NVLINK` for NVLink transport, as shown below.
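For example, the same proxy launch with NVLink transport (host and port as above):

```shell
lmdeploy serve proxy --server-name 0.0.0.0 --server-port 8000 \
    --serving-strategy DistServe --migration-protocol NVLINK
```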

### 2. Configure Endpoints

Deploy your prefill and decode engines:

```shell
# Prefill Engine
CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2_5-7b-chat --server-port 23333 --role Prefill --proxy-url http://0.0.0.0:8000 --backend pytorch
# Decode Engine
CUDA_VISIBLE_DEVICES=1 lmdeploy serve api_server internlm/internlm2_5-7b-chat --server-port 23334 --role Decode --proxy-url http://0.0.0.0:8000 --backend pytorch
```

Currently, only the **PyTorch backend** supports PD disaggregation.
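The engine role can also be set programmatically. A minimal sketch, assuming the `PytorchEngineConfig` fields added in this PR:

```python
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend

# Prefill engine configuration; decode engines use EngineRole.Decode instead.
engine_config = PytorchEngineConfig(
    role=EngineRole.Prefill,
    migration_backend=MigrationBackend.DLSlime,
)
```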

## API Usage

```shell
# API Invoke
curl -X POST "http://localhost:8000/v1/completions" \
-H "Content-Type: application/json" \
-d '{"model": "internlm/internlm2_5-7b-chat", "temperature":0, "prompt": "Shanghai is a city that ", "max_tokens": 16, "stream": false}'
# Output
{
  "id": "2",
  "object": "text_completion",
  "created": 1743662400,
  "model": "internlm/internlm2_5-7b-chat",
  "choices": [
    {
      "index": 0,
      "text": " is very famous for its skyscrapers. It is also a city",
      "logprobs": null,
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 7,
    "total_tokens": 23,
    "completion_tokens": 16
  }
}
```

## Troubleshooting

### RDMA Connection Failed

Make sure ibverbs is correctly installed:

```bash
# on Ubuntu
sudo apt install libibverbs-dev
# on CentOS
sudo yum install ibverbs-devel
```

```bash
ibstat # Verify IB device status
ibv_devinfo # Check device capabilities
```

### Check GPU Direct RDMA

Currently, lmdeploy-distserve uses GPUDirect RDMA to perform KV transfer. Make sure the GPUDirect RDMA driver is loaded into the kernel.

```bash
lsmod | grep nv_peer_mem
# GPUDirect RDMA info will be printed if GPUDirect RDMA is correctly loaded.
```

### Connection Pool

Currently, if the proxy disconnects, the connection pool must be warmed up again. A future enhancement could be a dedicated connection-pool management server (e.g., using Raft-based tools like etcd, as mentioned in Mooncake) to improve connection discovery and avoid repeated warmups.

### Proxy

Do not register an engine node with **different proxies**; this is not supported and is not considered correct usage at present.
1 change: 1 addition & 0 deletions lmdeploy/pytorch/disagg/__init__.py
@@ -0,0 +1 @@
# Copyright (c) OpenMMLab. All rights reserved.
24 changes: 24 additions & 0 deletions lmdeploy/pytorch/disagg/backend/__init__.py
@@ -0,0 +1,24 @@
# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.logger import get_logger

logger = get_logger('lmdeploy')

try:
logger.debug('Registering DLSlime Backend')
from .dlslime import DLSlimeBackend
except ImportError:
logger.warning('Disable DLSlime Backend')

try:
logger.debug('Registering Mooncake Backend')
from .mooncake import MooncakeBackend
except ImportError:
logger.warning('Disable Mooncake Backend')

try:
logger.debug('Registering InfiniStoreBackend Backend')
from .infinistore import InfiniStoreBackend
except ImportError:
logger.warning('Disable InfiniStoreBackend Backend')

__all__ = ['DLSlimeBackend', 'MooncakeBackend', 'InfiniStoreBackend']
4 changes: 4 additions & 0 deletions lmdeploy/pytorch/disagg/backend/backend.py
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.registry import Registry

MIGRATION_BACKENDS = Registry('migration_backend', locations=['lmdeploy.pytorch.disagg.backend.backend'])
37 changes: 37 additions & 0 deletions lmdeploy/pytorch/disagg/backend/base.py
@@ -0,0 +1,37 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod

from lmdeploy.pytorch.disagg.config import MigrationProtocol
from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment
from lmdeploy.pytorch.disagg.request import DistServeConnectionRequest, DistServeInitRequest


class MigrationBackendImpl:

@abstractmethod
def p2p_initialize(self, init_request: DistServeInitRequest):
raise NotImplementedError

@abstractmethod
def register_memory_region(self, register_mr_request: DistServeRegisterMRMessage):
raise NotImplementedError

@abstractmethod
def endpoint_info(self, remote_engine_id: int, protocol: MigrationProtocol):
raise NotImplementedError

@abstractmethod
def p2p_connect(self, conn_req: DistServeConnectionRequest):
raise NotImplementedError

@abstractmethod
def p2p_migrate(self, assignment: MigrationAssignment, async_op: bool = False):
raise NotImplementedError

@abstractmethod
def store(self, assignment: MigrationAssignment, async_op: bool = False):
raise NotImplementedError

@abstractmethod
def load(self, assignment: MigrationAssignment, async_op: bool = False):
raise NotImplementedError
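The abstract interface above, combined with the `MIGRATION_BACKENDS` registry, shows how additional transports plug in. A hypothetical sketch; `MyBackend` and its body are illustrative, not part of this PR:

```python
from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS
from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl
from lmdeploy.pytorch.disagg.request import DistServeInitRequest


@MIGRATION_BACKENDS.register_module(name='MyBackend')
class MyBackend(MigrationBackendImpl):
    """Illustrative skeleton; every abstract method must be overridden."""

    def p2p_initialize(self, init_request: DistServeInitRequest):
        ...  # open transport endpoints here (RDMA QPs, NVLink handles, etc.)
```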