
[V1] Move usage stats to worker and start logging TPU hardware #16211


Merged · 58 commits · Apr 25, 2025
Commits
f9d82ea
Track TPU usages in vLLM's data dashboards
dyli-google Mar 27, 2025
731b68a
Merge branch 'vllm-project:main' into main
dyli-google Mar 27, 2025
d2d9b9e
Make the code more robust
dyli-google Mar 27, 2025
f168647
Merge branch 'main' of https://github.com/dyli-google/vllm
dyli-google Mar 27, 2025
ee00cf7
Merge branch 'vllm-project:main' into main
dyli-google Apr 7, 2025
39d610f
Your descriptive message about the changes you made
dyli-google Apr 7, 2025
558c60f
format
dyli-google Apr 7, 2025
639f77b
use new API
dyli-google Apr 7, 2025
d5e7533
Merge branch 'vllm-project:main' into main
dyli-google Apr 7, 2025
d9b9d61
Merge branch 'vllm-project:main' into main
dyli-google Apr 7, 2025
8f055c9
address Simon's comments
dyli-google Apr 7, 2025
63bea36
Silence ImportError
dyli-google Apr 7, 2025
25fa30b
Merge branch 'vllm-project:main' into main
dyli-google Apr 8, 2025
8124c99
Merge branch 'vllm-project:main' into main
dyli-google Apr 9, 2025
6a4eea4
Use torch_xla.tpu.get_tpu_type() to get TPU version
dyli-google Apr 9, 2025
ae2f5a6
Merge branch 'vllm-project:main' into main
dyli-google Apr 10, 2025
5d2f2b6
Merge branch 'vllm-project:main' into main
dyli-google Apr 11, 2025
9b3a67c
Merge branch 'vllm-project:main' into main
dyli-google Apr 14, 2025
35fb26b
Merge branch 'vllm-project:main' into main
dyli-google Apr 14, 2025
b0912f0
Merge branch 'vllm-project:main' into main
dyli-google Apr 20, 2025
88dd6c6
Merge branch 'vllm-project:main' into main
dyli-google Apr 22, 2025
727bed5
Add usage to more engines
dyli-google Apr 22, 2025
4f94631
Merge branch 'vllm-project:main' into main
dyli-google Apr 22, 2025
619e496
fix error
dyli-google Apr 22, 2025
a1ae7ff
format
dyli-google Apr 23, 2025
1667fab
Merge branch 'vllm-project:main' into main
dyli-google Apr 23, 2025
9f725f6
Revert "format"
dyli-google Apr 23, 2025
b17dbc9
format
dyli-google Apr 23, 2025
5286466
Merge branch 'vllm-project:main' into main
dyli-google Apr 23, 2025
3bd0c9b
Use import torch_xla
dyli-google Apr 23, 2025
625d21c
Merge branch 'main' of https://github.com/dyli-google/vllm
dyli-google Apr 23, 2025
718729a
format
dyli-google Apr 23, 2025
6e61fba
format
dyli-google Apr 23, 2025
737646d
format
dyli-google Apr 23, 2025
0e093cc
Merge branch 'vllm-project:main' into main
dyli-google Apr 23, 2025
9940dad
Merge branch 'vllm-project:main' into main
dyli-google Apr 23, 2025
f825349
Try Qiliang's idea
dyli-google Apr 23, 2025
7798bde
Merge branch 'vllm-project:main' into main
dyli-google Apr 23, 2025
bbd7f5a
Use Yarong's 2nd idea
dyli-google Apr 24, 2025
5bf9f34
Merge branch 'main' into main
dyli-google Apr 24, 2025
4e38e67
revert vllm/engine/async_llm_engine.py
dyli-google Apr 24, 2025
fc18a7a
simplify code
dyli-google Apr 24, 2025
cf7997a
simplify
dyli-google Apr 24, 2025
3bd5730
fix typo
dyli-google Apr 24, 2025
4374c3c
format
dyli-google Apr 24, 2025
6829371
simplify
dyli-google Apr 24, 2025
3c55fc7
silence error
dyli-google Apr 24, 2025
bbee546
Suppress all exceptions
dyli-google Apr 24, 2025
429b6aa
format
dyli-google Apr 24, 2025
8939235
remove comment
dyli-google Apr 24, 2025
bc284db
Merge branch 'vllm-project:main' into main
dyli-google Apr 24, 2025
bac067a
report usage of TPU and GPU during worker init time
dyli-google Apr 24, 2025
3ad33a2
remove useless import
dyli-google Apr 24, 2025
5b0ab6d
format
dyli-google Apr 24, 2025
1f592e4
Merge branch 'vllm-project:main' into main
dyli-google Apr 24, 2025
98e7ae0
Merge branch 'vllm-project:main' into main
dyli-google Apr 24, 2025
689d343
Merge branch 'vllm-project:main' into main
dyli-google Apr 25, 2025
4eea0a9
Merge branch 'vllm-project:main' into main
dyli-google Apr 25, 2025
9 changes: 9 additions & 0 deletions vllm/usage/usage_lib.py
@@ -174,6 +174,15 @@ def _report_usage_once(self, model_architecture: str,
             cuda_get_device_properties(0, ("name", "total_memory")))
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
+        if current_platform.is_tpu():
+            try:
+                import torch_xla
+                self.gpu_count = torch_xla.runtime.world_size()
+                self.gpu_type = torch_xla.tpu.get_tpu_type()
+                self.gpu_memory_per_device = (
+                    torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
+            except Exception:
+                pass
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
         self.platform = platform.platform()
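The new block probes TPU hardware through torch_xla with the same best-effort pattern as the existing CUDA branch: any failure is swallowed so usage collection never breaks engine startup. Below is a minimal standalone sketch of that probe (assuming a TPU host with torch_xla installed; the dict keys simply mirror the UsageMessage fields set above):

def probe_tpu_hardware() -> dict:
    """Best-effort TPU probe; returns {} on non-TPU hosts or on any error."""
    try:
        import torch_xla

        return {
            # Same calls as the diff: device count, TPU generation string,
            # and the per-device HBM limit in bytes.
            "gpu_count": torch_xla.runtime.world_size(),
            "gpu_type": torch_xla.tpu.get_tpu_type(),
            "gpu_memory_per_device":
            torch_xla.core.xla_model.get_memory_info()["bytes_limit"],
        }
    except Exception:
        return {}


if __name__ == "__main__":
    print(probe_tpu_hardware())  # prints {} on a machine without torch_xla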
4 changes: 0 additions & 4 deletions vllm/v1/engine/async_llm.py
@@ -36,7 +36,6 @@
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -113,9 +112,6 @@ def __init__(
         except RuntimeError:
             pass
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
             cls,
4 changes: 0 additions & 4 deletions vllm/v1/engine/llm_engine.py
@@ -28,7 +28,6 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -97,9 +96,6 @@ def __init__(
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
             cls,
4 changes: 3 additions & 1 deletion vllm/v1/utils.py
@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
 
 
-def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+def report_usage_stats(
+        vllm_config,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
     """Report usage statistics if enabled."""
 
     if not is_usage_stats_enabled():
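The default value added here lets call sites that run inside a worker drop the explicit context argument. A small sketch of the two equivalent call forms (fragment only: vllm_config stands for an already-built VllmConfig, and UsageContext is assumed to be importable from vllm.usage.usage_lib as elsewhere in the codebase):

from vllm.usage.usage_lib import UsageContext
from vllm.v1.utils import report_usage_stats

# Old style: callers named the context explicitly.
report_usage_stats(vllm_config, UsageContext.ENGINE_CONTEXT)
# New style: workers just pass the config; ENGINE_CONTEXT is the default,
# and the call does nothing when usage stats collection is disabled.
report_usage_stats(vllm_config)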
5 changes: 5 additions & 0 deletions vllm/v1/worker/gpu_worker.py
@@ -23,6 +23,7 @@
 from vllm.utils import GiB_bytes
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
 
@@ -141,6 +142,10 @@ def init_device(self):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
6 changes: 5 additions & 1 deletion vllm/v1/worker/tpu_worker.py
@@ -21,7 +21,7 @@
 from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import bind_kv_cache
+from vllm.v1.utils import bind_kv_cache, report_usage_stats
 from vllm.v1.worker.tpu_model_runner import TPUModelRunner
 
 logger = init_logger(__name__)
@@ -133,6 +133,10 @@ def init_device(self):
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = TPUModelRunner(self.vllm_config, self.device)
 
+        if rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     def determine_available_memory(self) -> int:
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_spec = self.model_runner.get_kv_cache_spec()
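With both workers reporting from init_device, the usage record is emitted once per engine: every rank initializes its device, but only rank 0 calls report_usage_stats, which itself returns early when stats collection is disabled (the is_usage_stats_enabled() check shown above). A condensed sketch of that path using a hypothetical worker class, not vLLM's actual one:

from vllm.v1.utils import report_usage_stats


class WorkerSketch:
    """Hypothetical stand-in for gpu_worker.Worker / tpu_worker.TPUWorker."""

    def __init__(self, vllm_config, rank: int):
        self.vllm_config = vllm_config
        self.rank = rank

    def init_device(self):
        # ... platform, device, and model-runner setup elided ...
        if self.rank == 0:
            # One engine -> one usage record, regardless of world size.
            # No-op when usage stats collection is disabled.
            report_usage_stats(self.vllm_config)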