InternLM · lvhan028 · May 8, 2025 · Apr 1, 2025 · Apr 2, 2025 · Apr 2, 2025
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-
+from lmdeploy.disagg.messages import EngineRole, MigrationBackend, MigrationTransportProtocol
 from lmdeploy.utils import get_max_batch_size
 
 from .cli import CLI
@@ -125,6 +125,23 @@ def add_parser_api_server():
                             'engine’s tasks once the maximum number of concurrent requests is '
                             'reached, regardless of any additional requests sent by clients '
                             'concurrently during that time. Default to None.')
+        parser.add_argument('--role',
+                            type=str,
+                            default='Hybrid',
+                            choices=['Hybrid', 'Prefill', 'Decode'],
+                            help='Hybrid for Non-Disaggregated Engine; '
+                                 'Prefill for Disaggregated Prefill Engine; '
+                                 'Decode for Disaggregated Decode Engine.')
+        parser.add_argument('--migration-backend',
+                            type=str,
+                            default='DLSlime',
+                            choices=['DLSlime', 'Mooncake', 'InfiniStore'],
+                            help='kvcache migration management backend when PD disaggregation')
+        parser.add_argument('--migration-protocol',
+                            type=str,
+                            default='RDMA',
+                            choices=['TCP', 'RDMA', 'NVLINK'],
+                            help='kvcache migration protocol')
         # common args
         ArgumentHelper.backend(parser)
         ArgumentHelper.log_level(parser)
@@ -215,7 +232,12 @@ def add_parser_proxy():
         parser.set_defaults(run=SubCliServe.proxy)
         parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Host ip for proxy serving')
         parser.add_argument('--server-port', type=int, default=8000, help='Server port of the proxy')
-        parser.add_argument('--strategy',
+        parser.add_argument('--serving-strategy',
+                            type=str,
+                            choices=['Disaggregated', 'NonDisaggregated'],
+                            default='NonDisaggregated',
+                            help='the strategy to dispatch requests to nodes')
+        parser.add_argument('--routing-strategy',
                             type=str,
                             choices=['random', 'min_expected_latency', 'min_observed_latency'],
                             default='min_expected_latency',
@@ -307,7 +329,10 @@ def api_server(args):
                                                  device_type=args.device,
                                                  quant_policy=args.quant_policy,
                                                  eager_mode=args.eager_mode,
-                                                 max_prefill_token_num=args.max_prefill_token_num)
+                                                 max_prefill_token_num=args.max_prefill_token_num,
+                                                 role=EngineRole.__members__[args.role],
+                                                 migration_backend=MigrationBackend.__members__[args.migration_backend],
+                                                 migration_protocol=MigrationTransportProtocol.__members__[args.migration_protocol])
         else:
             from lmdeploy.messages import TurbomindEngineConfig
             backend_config = TurbomindEngineConfig(dtype=args.dtype,

diff --git a/lmdeploy/disagg/README.md b/lmdeploy/disagg/README.md
@@ -0,0 +1,58 @@
+# LMDeploy-DistServe
+
+## Key Components
+1. **Router Service**: Coordinates between prefill/decode engines
+2. **Migration Manager**: Facilitates high-performance memory sharing
+
+## Installation
+```
+# Inference Engine
+pip install "lmdeploy[all]>=0.7.0"
+
+# Transfer Engine
+pip install dlslime==0.0.1.post1
+```
+
+## Quick Start
+### 1. Configure Endpoints
+First deploy your prefill and decode engines.
+
+``` shell
+# Prefill Engine
+CUDA_VISIBLE_DEVICES=0,1 lmdeploy serve api_server internlm/internlm2_5-7b-chat --server-port 23333 --role Prefill --tp 2 --cache-block-seq 32
+# Decode Engine
+CUDA_VISIBLE_DEVICES=2,3 lmdeploy serve api_server internlm/internlm2_5-7b-chat --server-port 23334 --role Decode --tp 2 --cache-block-seq 32
+```
+
+### 2. Launch Router Service
+
+``` shell
+python -m lmdeploy.disagg.router \
+    --host 0.0.0.0 \
+    --port 5000 \
+    --prefill-endpoint http://prefill-host:port1 http://prefill-host:port2 \
+    --decode-endpoint http://decode-host:port3 http://decode-host:port4
+```
+
+## API Usage
+
+```shell
+# API Invoke
+curl -X POST "http://localhost:5000/v1/completions" \
+-H "Content-Type: application/json" \
+-d '{"model": "internlm/internlm2_5-7b-chat", "temperature":0, "prompt": "Shanghai is a city that ", "max_tokens": 16, "stream": false}'
+# Output
+{"id":"2","object":"text_completion","created":1743662400,"model":"/nvme1/majinming/hub/models--internlm--internlm2_5-7b-chat/snapshots/4434a5ffc2582f9d5ac45085043ed3e3264f0a9b","choices":[{"index":0,"text":" is very famous for its skyscrapers. It is also a city","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":7,"total_tokens":23,"completion_tokens":16}}
+```
+
+## Troubleshooting
+
+### RDMA Connection Failed:
+
+``` bash
+ibstatus      # Verify IB device status
+ibv_devinfo   # Check device capabilities
+```
+
+### Check NVSHMEM configuration:
+Make sure to verify NVSHMEM installation.
diff --git a/lmdeploy/disagg/__init__.py b/lmdeploy/disagg/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/disagg/backend/__init__.py b/lmdeploy/disagg/backend/__init__.py
@@ -0,0 +1,28 @@
+from typing import Dict
+from lmdeploy.logger import get_logger
+
+logger = get_logger("lmdeploy")
+
+# Importing a backend module registers its class in MIGRATION_BACKENDS through
+# the `register_migration_backend` decorator. A backend whose import fails
+# (e.g. the optional `dlslime` package is not installed) is skipped, and the
+# reason is written to the debug log instead of being discarded.
+
+try:
+    logger.debug("Registering DLSlime Backend")
+    from .dlslime import DLSlimeBackend
+except ImportError as e:
+    logger.debug("Disable DLSlime Backend: %s", e)
+
+try:
+    logger.debug("Registering Mooncake Backend")
+    from .mooncake import MooncakeBackend
+except ImportError as e:
+    logger.debug("Disable Mooncake Backend: %s", e)
+
+
+try:
+    logger.debug("Registering InfiniStoreBackend Backend")
+    from .infinistore import InfiniStoreBackend
+except ImportError as e:
+    logger.debug("Disable InfiniStoreBackend Backend: %s", e)
diff --git a/lmdeploy/disagg/backend/backend.py b/lmdeploy/disagg/backend/backend.py
@@ -0,0 +1,23 @@
+from lmdeploy.disagg.messages import MigrationBackend
+
+
+# Registry filled by `register_migration_backend`: backend id -> backend class.
+MIGRATION_BACKENDS = {}
+
+
+def register_migration_backend(backend_name: MigrationBackend):
+    """Return a class decorator that records the class in MIGRATION_BACKENDS.
+
+    Args:
+        backend_name: Key under which the decorated class is stored. A later
+            registration under the same key replaces the earlier one.
+
+    Returns:
+        A decorator that stores the class and hands it back unchanged.
+    """
+
+    def _store(backend_cls):
+        MIGRATION_BACKENDS.update({backend_name: backend_cls})
+        return backend_cls
+
+    return _store
diff --git a/lmdeploy/disagg/backend/base.py b/lmdeploy/disagg/backend/base.py
@@ -0,0 +1,57 @@
+from abc import abstractmethod
+
+from lmdeploy.disagg.messages import (
+    MigrationInitRequest,
+    MigrationConnectionRequest,
+    MigrationAssignment,
+    MigrationRegisterMemoryRequest,
+    MigrationTransportProtocol
+)
+
+
+class MigrationBackendImpl:
+    """Interface shared by the KV-cache migration backends.
+
+    NOTE(review): the class does not derive from ``abc.ABC``, so the
+    ``@abstractmethod`` markers are not enforced when a subclass is
+    instantiated; an operation that is not overridden fails only when called,
+    by raising ``NotImplementedError``.
+    """
+
+    @abstractmethod
+    def p2p_initialize(self, init_request: MigrationInitRequest):
+        """Prepare the link described by ``init_request``."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def register_memory_region(self, register_mr_request: MigrationRegisterMemoryRequest):
+        """Register the memory region described by ``register_mr_request``."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def endpoint_info(self, remote_engine_id: int, protocol: MigrationTransportProtocol):
+        """Return the local endpoint info for ``remote_engine_id`` and ``protocol``."""
+        # Raise like every other operation: returning the exception class would
+        # hand callers a truthy object in place of endpoint info.
+        raise NotImplementedError
+
+    @abstractmethod
+    def p2p_connect(self, connect_request: MigrationConnectionRequest):
+        """Connect to the peer described by ``connect_request``."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def p2p_migrate(self, assignment: MigrationAssignment):
+        """Carry out the point-to-point migration described by ``assignment``."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def store(self, assignment: MigrationAssignment):
+        """Store operation for ``assignment`` (semantics not defined in this file)."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def load(self, assignment: MigrationAssignment):
+        """Load operation for ``assignment`` (semantics not defined in this file)."""
+        raise NotImplementedError
+
diff --git a/lmdeploy/disagg/backend/dlslime.py b/lmdeploy/disagg/backend/dlslime.py
@@ -0,0 +1,124 @@
+from typing import Dict, Optional
+
+from lmdeploy.disagg.messages import (
+    MigrationBackend,
+    MigrationInitRequest,
+    MigrationTransportProtocol,
+    DisaggEngineConfig,
+    MigrationConnectionRequest,
+    MigrationAssignment,
+    MigrationRegisterMemoryRequest
+)
+
+from lmdeploy.disagg.backend.base import MigrationBackendImpl
+from lmdeploy.disagg.backend.backend import register_migration_backend
+
+from dlslime import RDMAEndpoint, available_nic
+
+
+class DLSlimeMigrationManagement:
+    """State of the link to one remote engine: one endpoint slot per protocol."""
+
+    def __init__(self, init_request: MigrationInitRequest):
+        """Record the link parameters and create the RDMA endpoint if requested.
+
+        Args:
+            init_request: Link description. When its ``rdma_init_request`` has
+                no ``device_name``, one is picked from ``available_nic()`` by
+                rank and written back into the request.
+
+        Raises:
+            RuntimeError: A device had to be picked but ``available_nic()``
+                returned nothing.
+        """
+        self.rank = init_request.rank
+        self.tp_rank = init_request.tp_rank
+        self.remote_engine_config: DisaggEngineConfig = init_request.remote_engine_config
+        # Keys are protocol enum members; only the RDMA slot is ever filled in
+        # this class, the other slots stay None.
+        self.endpoint: Dict[MigrationTransportProtocol, Optional[RDMAEndpoint]] = {
+            MigrationTransportProtocol.TCP: None,
+            MigrationTransportProtocol.RDMA: None,
+            MigrationTransportProtocol.NVLINK: None,
+        }
+        rdma_request = init_request.rdma_init_request
+        if rdma_request:
+            if not rdma_request.device_name:
+                nics = available_nic()
+                if not nics:
+                    # Fail with a clear message instead of the ZeroDivisionError
+                    # that `rank % len(nics)` would raise on an empty list.
+                    raise RuntimeError('DLSlime found no available RDMA NIC; '
+                                       'set the RDMA device name explicitly.')
+                # Spread ranks across the NICs round-robin.
+                rdma_request.device_name = nics[self.rank % len(nics)]
+            self.endpoint[MigrationTransportProtocol.RDMA] = RDMAEndpoint(
+                device_name=rdma_request.device_name,
+                ib_port=rdma_request.ib_port,
+                link_type=rdma_request.link_type
+            )
+
+    def register_memory_region(self, register_mr_request: MigrationRegisterMemoryRequest):
+        """Register ``(mr_key, addr, length)`` with the request's protocol endpoint."""
+        self.endpoint[register_mr_request.protocol].register_memory_region(
+            register_mr_request.mr_key,
+            register_mr_request.addr,
+            register_mr_request.length
+        )
+
+    def connect_to(self, connect_request: MigrationConnectionRequest):
+        """Connect the request's protocol endpoint to ``remote_endpoint_info``."""
+        self.endpoint[connect_request.protocol].connect_to(connect_request.remote_endpoint_info)
+
+    async def p2p_migrate(self, assignment: MigrationAssignment):
+        """Issue the assignment's batched reads, one slice of offsets at a time.
+
+        Slicing is driven by ``len(assignment.target_offset)``; the source
+        offsets are sliced with the same bounds.
+        """
+        # NOTE(review): the 6144 cap is not explained here - presumably a limit
+        # on requests per batch in the transfer engine; confirm.
+        max_batch = 4096 + 2048
+        for i in range(0, len(assignment.target_offset), max_batch):
+            await self.endpoint[assignment.protocol].read_batch_async(
+                assignment.mr_key,
+                assignment.target_offset[i: i+max_batch],
+                assignment.source_offset[i: i+max_batch],
+                assignment.length
+            )
+
+
+@register_migration_backend(MigrationBackend.DLSlime)
+class DLSlimeBackend(MigrationBackendImpl):
+    """DLSlime migration backend; keeps one link object per remote engine id."""
+
+    def __init__(self):
+        self.links: Dict[int, DLSlimeMigrationManagement] = {}
+
+    def p2p_initialize(self, init_request: MigrationInitRequest):
+        """Create (or replace) the link for ``init_request.remote_engine_id``."""
+        self.links[init_request.remote_engine_id] = DLSlimeMigrationManagement(init_request)
+
+    def register_memory_region(self, register_mr_request: MigrationRegisterMemoryRequest):
+        """Forward the registration to the link of the request's remote engine."""
+        self.links[register_mr_request.remote_engine_id].register_memory_region(register_mr_request)
+
+    def endpoint_info(self, remote_engine_id: int, protocol: MigrationTransportProtocol):
+        """Return ``local_endpoint_info`` of the link's endpoint for ``protocol``."""
+        return self.links[remote_engine_id].endpoint[protocol].local_endpoint_info
+
+    def p2p_connect(self, connect_request: MigrationConnectionRequest):
+        """Forward the connection request to the link of its remote engine."""
+        self.links[connect_request.remote_engine_id].connect_to(connect_request)
+
+    async def p2p_migrate(self, assignment: MigrationAssignment):
+        """Run the migration on the link of the assignment's remote engine."""
+        await self.links[assignment.remote_engine_id].p2p_migrate(assignment)
+
+    async def store(self, assignment: MigrationAssignment):
+        """Not supported by this backend."""
+        raise NotImplementedError
+
+    async def load(self, assignment: MigrationAssignment):
+        """Not supported by this backend."""
+        raise NotImplementedError
diff --git a/lmdeploy/disagg/backend/infinistore.py b/lmdeploy/disagg/backend/infinistore.py
@@ -0,0 +1,42 @@
+from lmdeploy.disagg.messages import (
+    MigrationBackend,
+    MigrationInitRequest,
+    MigrationConnectionRequest,
+    MigrationAssignment,
+    MigrationRegisterMemoryRequest,
+    MigrationTransportProtocol
+)
+
+from lmdeploy.disagg.backend.backend import register_migration_backend
+from lmdeploy.disagg.backend.base import MigrationBackendImpl
+
+
+@register_migration_backend(MigrationBackend.InfiniStore)
+class InfiniStoreBackend(MigrationBackendImpl):
+    """Placeholder registered under ``MigrationBackend.InfiniStore``.
+
+    No operation is implemented: every method raises ``NotImplementedError``.
+    """
+
+    def p2p_initialize(self, init_request: MigrationInitRequest):
+        raise NotImplementedError
+
+    def register_memory_region(self, register_mr_request: MigrationRegisterMemoryRequest):
+        raise NotImplementedError
+
+    def endpoint_info(self, remote_engine_id: int, protocol: MigrationTransportProtocol):
+        # Raise like the other stubs; returning the exception class would give
+        # callers a truthy object in place of endpoint info.
+        raise NotImplementedError
+
+    def p2p_connect(self, connect_request: MigrationConnectionRequest):
+        raise NotImplementedError
+
+    async def p2p_migrate(self, assignment: MigrationAssignment):
+        raise NotImplementedError
+
+    async def store(self, assignment: MigrationAssignment):
+        raise NotImplementedError
+
+    async def load(self, assignment: MigrationAssignment):
+        raise NotImplementedError
diff --git a/lmdeploy/disagg/backend/mooncake.py b/lmdeploy/disagg/backend/mooncake.py
@@ -0,0 +1,42 @@
+from lmdeploy.disagg.messages import (
+    MigrationBackend,
+    MigrationInitRequest,
+    MigrationConnectionRequest,
+    MigrationAssignment,
+    MigrationRegisterMemoryRequest,
+    MigrationTransportProtocol
+)
+
+from lmdeploy.disagg.backend.backend import register_migration_backend
+from lmdeploy.disagg.backend.base import MigrationBackendImpl
+
+
+@register_migration_backend(MigrationBackend.Mooncake)
+class MooncakeBackend(MigrationBackendImpl):
+    """Placeholder registered under ``MigrationBackend.Mooncake``.
+
+    No operation is implemented: every method raises ``NotImplementedError``.
+    """
+
+    def p2p_initialize(self, init_request: MigrationInitRequest):
+        raise NotImplementedError
+
+    def register_memory_region(self, register_mr_request: MigrationRegisterMemoryRequest):
+        raise NotImplementedError
+
+    def endpoint_info(self, remote_engine_id: int, protocol: MigrationTransportProtocol):
+        # Raise like the other stubs; returning the exception class would give
+        # callers a truthy object in place of endpoint info.
+        raise NotImplementedError
+
+    def p2p_connect(self, connect_request: MigrationConnectionRequest):
+        raise NotImplementedError
+
+    async def p2p_migrate(self, assignment: MigrationAssignment):
+        raise NotImplementedError
+
+    async def store(self, assignment: MigrationAssignment):
+        raise NotImplementedError
+
+    async def load(self, assignment: MigrationAssignment):
+        raise NotImplementedError
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Copyright (c) OpenMMLab. All rights reserved.