update deepgemm version #3606

Merged 3 commits on Jun 4, 2025
8 changes: 7 additions & 1 deletion lmdeploy/pytorch/backends/cuda/blockedf8_modules.py
@@ -69,6 +69,7 @@ def build(in_features: int, out_features: int, block_size: int = 128, bias: bool
logger.debug('build with DeepGemmLinearBlockedF8Impl')
return DeepGemmLinearBlockedF8Impl(in_features, out_features, block_size, dtype)
except: # noqa
logger.warning('Failed to import deep_gemm, LinearBlockedF8 fallback to triton implementation.')
return TritonLinearBlockedF8Impl(in_features, out_features, block_size, dtype)


@@ -89,6 +90,8 @@ def __init__(self, in_features: int, out_features: int, block_size: int, out_dty

def warmup(self, warmup_meta: WarmupMeta):
"""warmup."""
import random

from deep_gemm.jit_kernels.utils import get_m_alignment_for_contiguous_layout
device = 'cuda'
max_num_tokens = warmup_meta.max_num_tokens
@@ -100,7 +103,10 @@ def warmup(self, warmup_meta: WarmupMeta):
scale = torch.empty(((n + block_size - 1) // block_size, (k + block_size - 1) // block_size),
dtype=torch.float32,
device=device)
for m in range(alignment, range_end, alignment):
# shuffle ranges so ranks might compile different kernels concurrently.
ranges = list(range(alignment, range_end, alignment))
random.shuffle(ranges)
for m in ranges:
inputs = torch.empty(m, k, dtype=self.out_dtype, device=device)
input_quant, input_scale = quant_fp8_tma(inputs, self.block_size, dtype=weight.dtype)
deep_gemm_fp8(input_quant, input_scale, weight, scale, out_dtype=inputs.dtype)
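
The hunk above randomizes the order of the warmed-up m sizes. A minimal, self-contained sketch of that idea follows; the function name and the build_and_run_gemm callback are hypothetical stand-ins, not part of the patch:

    import random

    def warmup_m_sizes(alignment: int, range_end: int, build_and_run_gemm) -> None:
        # A sequential warmup would compile kernels for m = alignment, 2*alignment, ...
        sizes = list(range(alignment, range_end, alignment))
        # Shuffle so that, during multi-rank startup, ranks tend to JIT-compile
        # different m sizes at the same time instead of all queuing on the same build.
        random.shuffle(sizes)
        for m in sizes:
            build_and_run_gemm(m)
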
6 changes: 4 additions & 2 deletions lmdeploy/pytorch/backends/cuda/warmup_manager.py
@@ -37,9 +37,11 @@ def warmup(self, warmup_meta: WarmupMeta):
"""Warmup meta."""
if len(self._warmup_calls) == 0:
return

import random
logger.info('Warming up ops.')
for key, func in self._warmup_calls.items():
funcs = list(self._warmup_calls.values())
random.shuffle(funcs)
for func in funcs:
func(warmup_meta)


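
The same shuffling is applied one level up, to the registered warmup callbacks. A rough, self-contained sketch of such a registry is shown below; the class and method names are illustrative, not the actual WarmupManager API:

    import random
    from typing import Callable, Dict

    class WarmupRegistry:
        def __init__(self) -> None:
            self._calls: Dict[str, Callable[[], None]] = {}

        def register(self, key: str, func: Callable[[], None]) -> None:
            self._calls[key] = func

        def warmup(self) -> None:
            if not self._calls:
                return
            funcs = list(self._calls.values())
            # Randomize the callback order so each rank warms up ops in a
            # different sequence.
            random.shuffle(funcs)
            for func in funcs:
                func()
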
11 changes: 8 additions & 3 deletions lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
@@ -285,17 +285,22 @@ def grid(META):
@contextmanager
def _log_jit_build(M: int, N: int, K: int):
from deep_gemm.jit.runtime import RuntimeCache
origin_func = RuntimeCache.__getitem__

if hasattr(RuntimeCache, 'get'):
func_name = 'get'
else:
func_name = '__getitem__'
origin_func = getattr(RuntimeCache, func_name)

def __patched_func(self, *args, **kwargs):
ret = origin_func(self, *args, **kwargs)
if ret is None:
logger.warning(f'DeepGemm build <gemm_fp8_fp8_bf16_nt>: M={M}, N={N}, K={K}. Please waiting.')
return ret

RuntimeCache.__getitem__ = __patched_func
setattr(RuntimeCache, func_name, __patched_func)
yield
RuntimeCache.__getitem__ = origin_func
setattr(RuntimeCache, func_name, origin_func)


def deep_gemm_fp8(A: Tensor,
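
The compatibility shim above patches whichever cache-lookup method the installed deep_gemm exposes (RuntimeCache.get on newer releases, RuntimeCache.__getitem__ on older ones). A generic sketch of that pattern is below; the helper name is hypothetical, and it adds a try/finally so the original method is restored even if the wrapped code raises:

    from contextlib import contextmanager

    @contextmanager
    def patch_first_existing(cls, candidate_names, make_wrapper):
        # Pick the first method name the class actually defines.
        name = next(n for n in candidate_names if hasattr(cls, n))
        original = getattr(cls, name)
        setattr(cls, name, make_wrapper(original))
        try:
            yield
        finally:
            # Always restore the original, even if the body raised.
            setattr(cls, name, original)

    # Usage matching the diff (wrapper_factory builds the logging wrapper):
    # with patch_first_existing(RuntimeCache, ('get', '__getitem__'), wrapper_factory):
    #     ...
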