[Misc] Change buckets of histogram_iteration_tokens to [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, 16192] to represent number of tokens (vllm-project#17033)

sfc-gh-zhwang · Yuqi Zhang · commit ef18f5f861a7 · 2025-05-24T08:02:37.000Z
Signed-off-by: sfc-gh-zhwang <flex.wang@snowflake.com>
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
@@ -140,16 +140,13 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             name="vllm:generation_tokens_total",
             documentation="Number of generation tokens processed.",
             labelnames=labelnames)
-        buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
-        if not vllm_config.model_config.enforce_eager:
-            buckets = vllm_config.compilation_config.\
-                cudagraph_capture_sizes.copy()
-            buckets.sort()
         self.histogram_iteration_tokens = self._histogram_cls(
             name="vllm:iteration_tokens_total",
             documentation="Histogram of number of tokens per engine_step.",
             labelnames=labelnames,
-            buckets=buckets)
+            buckets=[
+                1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096, 16192
+            ])
         self.histogram_time_to_first_token = self._histogram_cls(
             name="vllm:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
@@ -232,7 +232,10 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
             prometheus_client.Histogram(
                 name="vllm:iteration_tokens_total",
                 documentation="Histogram of number of tokens per engine_step.",
-                buckets=build_cudagraph_buckets(vllm_config),
+                buckets=[
+                    1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096,
+                    16192
+                ],
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_max_num_generation_tokens_request = \
@@ -467,16 +470,6 @@ def build_1_2_5_buckets(max_value: int) -> list[int]:
     return build_buckets([1, 2, 5], max_value)
 
 
-def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]:
-    if not vllm_config.model_config.enforce_eager:
-        buckets = vllm_config.compilation_config.\
-            cudagraph_capture_sizes.copy()
-        buckets.sort()
-        return buckets
-    else:
-        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
-
-
 def setup_default_loggers(
     vllm_config: VllmConfig,
     log_stats: bool,