vllm启动失败 (vLLM fails to start)

#1
by chuzhenfang - opened

ValueError: The output_size of gate's and up's weight = 320 is not divisible by weight quantization block_n = 128.

Using the "--enable-expert-parallel" option might help.

Using the "--enable-expert-parallel" option might help.

Thanks! It works. With tp=8, is the output_size of the gate's and up's weight split from the original 2560 down to 320?

I am getting the same error with SGLang. Can someone help?

[2025-07-25 09:53:38 TP3] Scheduler hit an exception: Traceback (most recent call last):
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2921, in run_scheduler_process
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 344, in init
self.tp_worker = TpWorkerClass(
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 66, in init
self.worker = TpModelWorker(
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 81, in init
self.model_runner = ModelRunner(
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 234, in init
self.initialize(min_per_gpu_memory)
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 277, in initialize
self.load_model()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 623, in load_model
self.model = get_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/init.py", line 22, in get_model
return loader.load_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 432, in load_model
model = _initialize_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 174, in _initialize_model
return model_class(
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 702, in init
self.model = Qwen3MoeModel(
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 680, in init
super().init(
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen2_moe.py", line 430, in init
self.layers, self.start_layer, self.end_layer = make_layers(
File "/sgl-workspace/sglang/python/sglang/srt/utils.py", line 525, in make_layers
+ [
File "/sgl-workspace/sglang/python/sglang/srt/utils.py", line 526, in
maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen2_moe.py", line 432, in
lambda idx, prefix: decoder_layer_type(
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 558, in init
self.mlp = Qwen3MoeSparseMoeBlock(
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 110, in init
self.experts = get_moe_impl_class()(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 148, in init
self.quant_method.create_weights(
File "/sgl-workspace/sglang/python/sglang/srt/layers/quantization/fp8.py", line 533, in create_weights
raise ValueError(
ValueError: The output_size of gate's and up's weight = 320 is not divisible by weight quantization block_n = 128.

For SGLang, using "--enable-ep-moe" may help.

Sign up or log in to comment