Commit 99f0efd

ApostaC authored and Yuqi Zhang committed
[v1] [P/D] Adding LMCache KV connector for v1 (vllm-project#16625)
Signed-off-by: Yuqi Zhang <[email protected]>
1 parent 3fa950a commit 99f0efd

12 files changed, +793 -0 lines changed

examples/lmcache/README.md

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# LMCache Examples

This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing.

## 1. Disaggregated Prefill in vLLM v1

This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node.

### Prerequisites

- Install [LMCache](https://github.com/LMCache/LMCache)
- Install [NIXL](https://github.com/ai-dynamo/nixl)
- At least 2 GPUs
- A valid Hugging Face token (HF_TOKEN) with access to Llama 3.1 8B Instruct

### Usage

Run

`cd disagg_prefill_lmcache_v1`

to enter the `disagg_prefill_lmcache_v1` folder, and then run

```bash
bash disagg_example_nixl.sh
```

to run disaggregated prefill and benchmark the performance.
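
Once the example is running, the proxy exposes an OpenAI-compatible `/v1/completions` endpoint on `localhost:9000` and forwards requests to the prefiller and decoder (see the main script below). As a quick smoke test you can send a request to it yourself; the snippet below is a minimal sketch using the `requests` library, assuming the default host, port and model used by the scripts in this example.

```python
# Minimal smoke test against the proxy started by disagg_example_nixl.sh.
# Host, port and model name are the defaults used by the scripts in this
# example; adjust them if you changed the launch configuration.
import requests

resp = requests.post(
    "http://localhost:9000/v1/completions",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "prompt": "Explain disaggregated prefill in one sentence.",
        "max_tokens": 64,
        "temperature": 0,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```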

### Components

#### Server Scripts

- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches the individual vLLM servers for prefill/decode, and also launches the proxy server.
- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between the prefiller and the decoder.
- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example.

#### Configuration

- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for the prefiller server.
- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for the decoder server.

#### Log Files

The main script generates several log files:

- `prefiller.log` - Logs from the prefill server
- `decoder.log` - Logs from the decode server
- `proxy.log` - Logs from the proxy server

## 2. CPU Offload Examples

- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1

## 3. KV Cache Sharing

The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.
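
The sharing example file itself is not shown in this hunk, so the snippet below is only an illustrative sketch of the idea: both vLLM v1 instances use the `LMCacheConnectorV1` connector (as in the CPU-offload example above) and point at a common LMCache backend. The `LMCACHE_REMOTE_URL` and `LMCACHE_REMOTE_SERDE` variables are assumptions that follow the `LMCACHE_*` naming pattern used in `cpu_offload_lmcache_v1.py`; consult the actual example for the exact setup.

```python
# Illustrative sketch only -- see kv_cache_sharing_lmcache_v1.py for the real
# setup. Each vLLM instance would normally run in its own process on its own
# GPU; the shared LMCache backend address below is a placeholder assumption.
import os

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
os.environ["LMCACHE_REMOTE_URL"] = "lm://localhost:65432"  # assumed shared backend
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"               # assumed serializer

ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')

# The first instance stores the KV cache of the long shared prefix.
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8)
prompt = "Hello, how are you?" * 1000 + "Hello, my name is"
llm.generate([prompt], SamplingParams(temperature=0, max_tokens=10))
# A second instance configured the same way can then reuse that KV cache
# instead of recomputing the shared prefix.
```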

## 4. Disaggregated Prefill in vLLM v0

The `disaggregated_prefill_lmcache_v0.py` script provides an example of how to run disaggregated prefill in vLLM v0.
examples/lmcache/cpu_offload_lmcache_v1.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates example usage of CPU offloading
with LMCache in vLLM v1.

Note that LMCache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Enable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"

# This example script runs two requests with a shared prefix.
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
    shared_prompt + "Hello, my name is",
]
second_prompt = [
    shared_prompt + "Tell me a very long story",
]

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8)

# Should be able to see logs like the following:
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
# This indicates that the KV cache has been stored in LMCache.
outputs = llm.generate(first_prompt, sampling_params)
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")

# Clean up lmcache backend
LMCacheEngineBuilder.destroy(ENGINE_NAME)
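
The file defines `second_prompt` but never issues it, even though the comment above mentions two requests with a shared prefix. Continuing from the example, a possible follow-up (a sketch, not part of the committed file) would issue the second request to exercise LMCache's retrieval path:

```python
# Sketch (not in the committed file): issue the second request before calling
# LMCacheEngineBuilder.destroy(), so the shared prefix can be served from the
# CPU cache populated by the first request. LMCache should report retrieving
# the stored prefix in its logs.
outputs = llm.generate(second_prompt, sampling_params)
for output in outputs:
    print(f"Generated text: {output.outputs[0].text!r}")
```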
examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
1+
local_cpu: False
2+
max_local_cpu_size: 0
3+
#local_disk:
4+
max_local_disk_size: 0
5+
remote_serde: NULL
6+
7+
enable_nixl: True
8+
nixl_role: "receiver"
9+
nixl_peer_host: "localhost"
10+
nixl_peer_port: 55555
11+
nixl_buffer_size: 1073741824 # 1GB
12+
nixl_buffer_device: "cuda"
13+
nixl_enable_gc: True
examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
1+
local_cpu: False
2+
max_local_cpu_size: 0
3+
#local_disk:
4+
max_local_disk_size: 0
5+
remote_serde: NULL
6+
7+
enable_nixl: True
8+
nixl_role: "sender"
9+
nixl_peer_host: "localhost"
10+
nixl_peer_port: 55555
11+
nixl_buffer_size: 1073741824 # 1GB
12+
nixl_buffer_device: "cuda"
13+
nixl_enable_gc: True
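
The two YAML files configure the LMCache engine inside the corresponding vLLM process and differ only in `nixl_role` (`sender` for the prefiller, `receiver` for the decoder). The snippet below is a rough sketch of how a prefiller instance might pick up its config; the `LMCACHE_CONFIG_FILE` environment variable and the `kv_producer` role are assumptions about how the launcher wires things together, and `disagg_vllm_launcher.sh` in this commit is the authoritative reference.

```python
# Rough sketch: a prefiller vLLM v1 instance using lmcache-prefiller-config.yaml.
# LMCACHE_CONFIG_FILE and kv_producer are assumptions; see disagg_vllm_launcher.sh
# for the actual command lines used in this example.
import os

from vllm import LLM
from vllm.config import KVTransferConfig

os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_CONFIG_FILE"] = "configs/lmcache-prefiller-config.yaml"

ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_producer"}')

llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8)
```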
examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
#!/bin/bash

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."


PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
    # Check that at least 2 GPUs are available via nvidia-smi
    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}

ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    python -c "import $1" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        if [ "$1" == "nixl" ]; then
            echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
        else
            echo "$1 is not installed. Please install it via pip install $1."
        fi
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM      # prevent re-entrancy
    kill -- -$$          # negative PID == "this whole process group"
    wait                 # reap children so we don't leave zombies
    exit 0
}

wait_for_server() {
    local port=$1
    local timeout_seconds=1200
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server"
            return 1
        fi

        sleep 1
    done
}


main() {
    check_hf_token
    check_num_gpus
    ensure_python_library_installed lmcache
    ensure_python_library_installed nixl
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching prefiller, decoder and proxy..."
    echo "Please check prefiller.log, decoder.log and proxy.log for logs."

    bash disagg_vllm_launcher.sh prefiller \
        > >(tee prefiller.log) 2>&1 &
    prefiller_pid=$!
    PIDS+=($prefiller_pid)

    bash disagg_vllm_launcher.sh decoder \
        > >(tee decoder.log) 2>&1 &
    decoder_pid=$!
    PIDS+=($decoder_pid)

    python3 disagg_proxy_server.py \
        --host localhost \
        --port 9000 \
        --prefiller-host localhost \
        --prefiller-port 8100 \
        --decoder-host localhost \
        --decoder-port 8200 \
        > >(tee proxy.log) 2>&1 &
    proxy_pid=$!
    PIDS+=($proxy_pid)

    wait_for_server 8100
    wait_for_server 8200
    wait_for_server 9000

    echo "All servers are up. Starting benchmark..."

    # begin benchmark
    cd ../../../benchmarks/
    python benchmark_serving.py --port 9000 --seed $(date +%s) \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup

}

main
