Llama 4 Maverick fp8 CompressedTensorsW8A16Fp8 not defined #5134

Closed
@ehartford

Description

I tried with the latest main:

python -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 --tp 8

Output:

'CompressedTensorsW8A16Fp8' is not defined. Did you mean: 'CompressedTensorsW8A8Fp8'?

Stack Trace:

[2025-04-07 07:33:52 TP0] Scheduler hit an exception: Traceback (most recent call last):
  File "/raid/workspace/sglang/python/sglang/srt/managers/scheduler.py", line 1999, in run_scheduler_process
    scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/scheduler.py", line 249, in __init__
    self.tp_worker = TpWorkerClass(
                     ^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 63, in __init__
    self.worker = TpModelWorker(server_args, gpu_id, tp_rank, dp_rank, nccl_port)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 74, in __init__
    self.model_runner = ModelRunner(
                        ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 178, in __init__
    self.initialize(min_per_gpu_memory)
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 188, in initialize
    self.load_model()
  File "/raid/workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 400, in load_model
    self.model = get_model(
                 ^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/__init__.py", line 22, in get_model
    return loader.load_model(
           ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/loader.py", line 365, in load_model
    model = _initialize_model(
            ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/model_loader/loader.py", line 146, in _initialize_model
    return model_class(
           ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/mllama4.py", line 34, in __init__
    self.language_model = Llama4ForCausalLM(
                          ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 409, in __init__
    super().__init__(config, quant_config, prefix)
  File "/raid/workspace/sglang/python/sglang/srt/models/llama.py", line 380, in __init__
    self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 417, in _init_model
    return Llama4Model(config, quant_config=quant_config, prefix=prefix)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 354, in __init__
    self.layers = make_layers(
                  ^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/utils.py", line 402, in make_layers
    maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 356, in <lambda>
    lambda idx, prefix: Llama4DecoderLayer(
                        ^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 278, in __init__
    self.self_attn = Llama4Attention(
                     ^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/models/llama4.py", line 178, in __init__
    self.qkv_proj = QKVParallelLinear(
                    ^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 808, in __init__
    super().__init__(
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 336, in __init__
    super().__init__(
  File "/raid/workspace/sglang/python/sglang/srt/layers/linear.py", line 209, in __init__
    self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 120, in get_quant_method
    scheme = self.get_scheme(layer=layer, layer_name=prefix)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 509, in get_scheme
    scheme = self._get_scheme_from_parts(  # type: ignore
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/raid/workspace/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py", line 392, in _get_scheme_from_parts
    return CompressedTensorsW8A16Fp8(
           ^^^^^^^^^^^^^^^^^^^^^^^^^
NameError: name 'CompressedTensorsW8A16Fp8' is not defined. Did you mean: 'CompressedTensorsW8A8Fp8'?
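The NameError comes from sglang's port of the compressed-tensors quantization config: _get_scheme_from_parts (compressed_tensors.py:392) returns CompressedTensorsW8A16Fp8, but that scheme class is never imported or defined in the module, so any checkpoint whose config resolves to a weight-only FP8 scheme (as this Maverick FP8 checkpoint does) crashes at load time. A minimal sketch of one possible fix, under the assumption that the scheme can be reused from vLLM's compressed-tensors schemes package (sglang may instead need to vendor its own copy next to CompressedTensorsW8A8Fp8):

    # Sketch only, in
    # python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py.
    # Assumption: vLLM is installed alongside sglang and exports this scheme.
    from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
        CompressedTensorsW8A16Fp8,  # scheme referenced at compressed_tensors.py:392
    )

Note that the interpreter's "Did you mean: 'CompressedTensorsW8A8Fp8'?" suggestion is likely misleading here: W8A8 quantizes activations as well as weights, so it is not a drop-in substitute for a weight-only W8A16 FP8 checkpoint.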
