Description
I tried with the latest main branch:
python -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 --tp 8
Output:
NameError: name 'CompressedTensorsW8A16Fp8' is not defined. Did you mean: 'CompressedTensorsW8A8Fp8'?
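For context, the failure happens in the compressed-tensors quantization path, which is selected because the FP8 checkpoint ships a `quantization_config` in its `config.json`. A quick sketch to confirm that (assumes `huggingface_hub` is installed and you have access to the gated repo):

```python
# Sketch: inspect the checkpoint's quantization_config to confirm it is a
# compressed-tensors FP8 config, which is what routes layer construction
# through compressed_tensors.py in the traceback below.
import json
from huggingface_hub import hf_hub_download  # assumes access to the gated repo

path = hf_hub_download(
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "config.json"
)
with open(path) as f:
    config = json.load(f)
# Prints None if the key is absent (or nested differently for this model).
print(json.dumps(config.get("quantization_config"), indent=2))
```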
Stack Trace:
[2025-04-07 07:33:52 TP0] Scheduler hit an exception: Traceback (most recent call last):
  File "/raid/workspace/sglang/python/sglang/srt/managers/scheduler.py", line 1999, in run_scheduler_process
    scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/scheduler.py", line 249, in __init__
    self.tp_worker = TpWorkerClass(
                     ^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 63, in __init__
    self.worker = TpModelWorker(server_args, gpu_id, tp_rank, dp_rank, nccl_port)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 74, in __init__
    self.model_runner = ModelRunner(
                        ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 178, in __init__
    self.initialize(min_per_gpu_memory)
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 188, in initialize
    self.load_model()
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 400, in load_model
    self.model = get_model(
                 ^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/__init__.py", line 22, in get_model
    return loader.load_model(
           ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/loader.py", line 365, in load_model
    model = _initialize_model(
            ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/loader.py", line 146, in _initialize_model
    return model_class(
           ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/mllama4.py", line 34, in __init__
    self.language_model = Llama4ForCausalLM(
                          ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 409, in __init__
    super().__init__(config, quant_config, prefix)
  File "/raid/workspace/sglang/python/sglang/srt/models/llama.py", line 380, in __init__
    self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 417, in _init_model
    return Llama4Model(config, quant_config=quant_config, prefix=prefix)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 354, in __init__
    self.layers = make_layers(
                  ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/utils.py", line 402, in make_layers
    maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 356, in <lambda>
    lambda idx, prefix: Llama4DecoderLayer(
                        ^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 278, in __init__
    self.self_attn = Llama4Attention(
                     ^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 178, in __init__
    self.qkv_proj = QKVParallelLinear(
                    ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 808, in __init__
    super().__init__(
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 336, in __init__
    super().__init__(
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 209, in __init__
    self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 120, in get_quant_method
    scheme = self.get_scheme(layer=layer, layer_name=prefix)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 509, in get_scheme
    scheme = self._get_scheme_from_parts(  # type: ignore
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 392, in _get_scheme_from_parts
    return CompressedTensorsW8A16Fp8(
           ^^^^^^^^^^^^^^^^^^^^^^^^^
NameError: name 'CompressedTensorsW8A16Fp8' is not defined. Did you mean: 'CompressedTensorsW8A8Fp8'?
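Judging by the last frame, this looks like an incomplete port of the compressed-tensors schemes: `_get_scheme_from_parts` (compressed_tensors.py line 392) returns `CompressedTensorsW8A16Fp8`, but that class is never defined or imported in the module, so any checkpoint that resolves to the W8A16 FP8 scheme (FP8 weights with 16-bit activations, e.g. as a fallback when FP8 activations aren't usable) hits this `NameError` instead of loading. Below is a minimal sketch of the kind of import that appears to be missing; the module paths are assumptions inferred from the traceback, and the vLLM fallback presumes the upstream implementation this file seems to be derived from:

```python
# Sketch only, not the official fix: make the name resolve in
# sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py.
try:
    # Assumed location if the scheme had been ported alongside the W8A8 one:
    from sglang.srt.layers.quantization.compressed_tensors.schemes import (
        CompressedTensorsW8A16Fp8,
    )
except ImportError:
    # vLLM ships an implementation of the same scheme, which (as far as I can
    # tell) takes the same (strategy, is_static_input_scheme) constructor args.
    from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
        CompressedTensorsW8A16Fp8,
    )
```

The real fix is presumably to port the W8A16 FP8 scheme into SGLang's `schemes/` package and export it from its `__init__.py`, the same way `CompressedTensorsW8A8Fp8` already is.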