Commit 99ed526

[Misc] refactor examples series - lmcache (#16758)
Signed-off-by: reidliu41 <[email protected]>
Co-authored-by: reidliu41 <[email protected]>
1 parent 207da28 commit 99ed526

examples/offline_inference/cpu_offload_lmcache.py

Lines changed: 85 additions & 50 deletions
@@ -3,9 +3,12 @@
 This file demonstrates the example usage of cpu offloading
 with LMCache.

-Note that `pip install lmcache` is needed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
+Note that `lmcache` is needed to run this example.
+Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
+Learn more about LMCache environment setup, please refer to:
+https://docs.lmcache.ai/getting_started/installation.html
 """
+import contextlib
 import os
 import time

@@ -15,51 +18,83 @@
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig

-# LMCache-related environment variables
-# Use experimental features in LMCache
-os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-# LMCache is set to use 256 tokens per chunk
-os.environ["LMCACHE_CHUNK_SIZE"] = "256"
-# Enable local CPU backend in LMCache
-os.environ["LMCACHE_LOCAL_CPU"] = "True"
-# Set local CPU memory limit to 5.0 GB
-os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-
-# This example script runs two requests with a shared prefix.
-shared_prompt = "Hello, how are you?" * 1000
-first_prompt = [
-    shared_prompt + "Hello, my name is",
-]
-second_prompt = [
-    shared_prompt + "Tell me a very long story",
-]
-
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-
-ktc = KVTransferConfig.from_cli(
-    '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
-# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
-# memory. Reduce the value if your GPU has less memory.
-# Note that LMCache is not compatible with chunked prefill for now.
-llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-          kv_transfer_config=ktc,
-          max_model_len=8000,
-          enable_chunked_prefill=False,
-          gpu_memory_utilization=0.8)
-
-outputs = llm.generate(first_prompt, sampling_params)
-for output in outputs:
-    generated_text = output.outputs[0].text
-    print(f"Generated text: {generated_text!r}")
-print("First request done.")
-
-time.sleep(1)
-
-outputs = llm.generate(second_prompt, sampling_params)
-for output in outputs:
-    generated_text = output.outputs[0].text
-    print(f"Generated text: {generated_text!r}")
-print("Second request done.")
-
-# Clean up lmcache backend
-LMCacheEngineBuilder.destroy(ENGINE_NAME)
+
+def setup_environment_variables():
+    # LMCache-related environment variables
+    # Use experimental features in LMCache
+    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+    # LMCache is set to use 256 tokens per chunk
+    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
+    # Enable local CPU backend in LMCache
+    os.environ["LMCACHE_LOCAL_CPU"] = "True"
+    # Set local CPU memory limit to 5.0 GB
+    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
+
+
+@contextlib.contextmanager
+def build_llm_with_lmcache():
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+    # memory. Reduce the value if your GPU has less memory.
+    # Note that LMCache is not compatible with chunked prefill for now.
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+              kv_transfer_config=ktc,
+              max_model_len=8000,
+              enable_chunked_prefill=False,
+              gpu_memory_utilization=0.8)
+
+    try:
+        yield llm
+    finally:
+        # Clean up lmcache backend
+        LMCacheEngineBuilder.destroy(ENGINE_NAME)
+
+
+def print_output(
+    llm: LLM,
+    prompt: list[str],
+    sampling_params: SamplingParams,
+    req_str: str,
+):
+    start = time.time()
+    outputs = llm.generate(prompt, sampling_params)
+    print("-" * 50)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Generated text: {generated_text!r}")
+    print(f"Generation took {time.time() - start:.2f} seconds, "
+          f"{req_str} request done.")
+    print("-" * 50)
+
+
+def main():
+    setup_environment_variables()
+
+    with build_llm_with_lmcache() as llm:
+
+        # This example script runs two requests with a shared prefix.
+        # Define the shared prompt and specific prompts
+        shared_prompt = "Hello, how are you?" * 1000
+        first_prompt = [
+            shared_prompt + "Hello, my name is",
+        ]
+        second_prompt = [
+            shared_prompt + "Tell me a very long story",
+        ]
+
+        sampling_params = SamplingParams(temperature=0,
+                                         top_p=0.95,
+                                         max_tokens=10)
+
+        # Print the first output
+        print_output(llm, first_prompt, sampling_params, "first")
+
+        time.sleep(1)
+
+        # print the second output
+        print_output(llm, second_prompt, sampling_params, "second")
+
+
+if __name__ == "__main__":
+    main()
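
The core of the refactor is wrapping engine construction and LMCache teardown in a `contextlib.contextmanager`, so `LMCacheEngineBuilder.destroy(ENGINE_NAME)` runs even if generation inside the block raises. Below is a minimal, self-contained sketch of that setup/teardown pattern; the dict-based `engine` object is a hypothetical stand-in for the vLLM `LLM` instance and the LMCache cleanup call, not the real API.

import contextlib


@contextlib.contextmanager
def build_engine():
    # Stand-in for constructing LLM(...) with a KVTransferConfig.
    engine = {"alive": True}
    try:
        yield engine
    finally:
        # Stand-in for LMCacheEngineBuilder.destroy(ENGINE_NAME);
        # this teardown runs even if the with-block body raises.
        engine["alive"] = False


if __name__ == "__main__":
    with build_engine() as engine:
        print("inside block, engine alive:", engine["alive"])
    print("after block, engine alive:", engine["alive"])

Compared with the previous module-level script, this structure keeps the two generation calls identical while guaranteeing the cache backend is destroyed exactly once.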
