Description

Running the GRPO GSM8K example with the sglang rollout (rollout.n=5) fails during generation with an AssertionError from DataProto.check_consistency: "key tools_kwargs length 256 is not equal to batch size 1280". Since 256 × 5 = 1280, it appears the tools_kwargs entry in non_tensor_batch is not repeated along with the n sampled responses.

Script to reproduce (using the sglang docker image):
set -x
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='verl_fsdp_comparison' \
trainer.experiment_name='fsdp_1' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@
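
For reference, the two numbers in the assertion below line up with this configuration. A quick arithmetic sketch; the assumption that the 1024 prompts are split across 8 / 2 = 4 rollout engines is mine and not confirmed by the log:

# Sketch of where 256 and 1280 in the AssertionError come from (4-engine split is an assumption)
train_batch_size = 1024   # data.train_batch_size
n_gpus = 8                # trainer.n_gpus_per_node
tp_size = 2               # actor_rollout_ref.rollout.tensor_model_parallel_size
n = 5                     # actor_rollout_ref.rollout.n

rollout_engines = n_gpus // tp_size                        # 4 sglang engines (assumed data-parallel split)
prompts_per_engine = train_batch_size // rollout_engines   # 256
responses_per_engine = prompts_per_engine * n              # 1280
print(prompts_per_engine, responses_per_engine)            # 256 1280, matching the AssertionError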
Error
Training Progress: 0%| | 0/7 [00:48<?, ?it/s]
Error executing job with overrides: ['algorithm.adv_estimator=grpo', 'data.train_files=/root/data/gsm8k/train.parquet', 'data.val_files=/root/data/gsm8k/test.parquet', 'data.train_batch_size=1024', 'data.max_prompt_length=512', 'data.max_response_length=1024', 'data.filter_overlong_prompts=True', 'data.truncation=error', 'actor_rollout_ref.rollout.name=sglang', 'actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.model.use_remove_padding=True', 'actor_rollout_ref.actor.ppo_mini_batch_size=256', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40', 'actor_rollout_ref.actor.use_kl_loss=True', 'actor_rollout_ref.actor.kl_loss_coef=0.001', 'actor_rollout_ref.actor.kl_loss_type=low_var_kl', 'actor_rollout_ref.actor.entropy_coeff=0', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', 'actor_rollout_ref.actor.fsdp_config.param_offload=False', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=False', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40', 'actor_rollout_ref.rollout.tensor_model_parallel_size=2', 'actor_rollout_ref.rollout.name=sglang', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.6', 'actor_rollout_ref.rollout.n=5', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40', 'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'algorithm.use_kl_in_reward=False', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=verl_fsdp_comparison', 'trainer.experiment_name=fsdp_1', 'trainer.n_gpus_per_node=8', 'trainer.nnodes=1', 'trainer.save_freq=20', 'trainer.test_freq=5', 'trainer.total_epochs=1']
Traceback (most recent call last):
File "/data/verl/verl/trainer/main_ppo.py", line 64, in main
run_ppo(config)
File "/data/verl/verl/trainer/main_ppo.py", line 79, in run_ppo
ray.get(runner.run.remote(config))
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2782, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 929, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AssertionError): ray::TaskRunner.run() (pid=85727, ip=172.17.0.2, actor_id=4ea891a2e4ff79b13aedd55c01000000, repr=<main_ppo.TaskRunner object at 0x74caae7af640>)
File "/data/verl/verl/trainer/main_ppo.py", line 186, in run
trainer.fit()
File "/data/verl/verl/trainer/ppo/ray_trainer.py", line 910, in fit
gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
File "/data/verl/verl/single_controller/ray/base.py", line 49, in func
output = ray.get(output)
ray.exceptions.RayTaskError(AssertionError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=87129, ip=172.17.0.2, actor_id=2b0a81658b0632dbd2e0273401000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x79b6f839c970>)
File "/data/verl/verl/single_controller/ray/base.py", line 459, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/data/verl/verl/single_controller/base/decorator.py", line 465, in inner
return func(*args, **kwargs)
File "/data/verl/verl/workers/fsdp_workers.py", line 609, in generate_sequences
output = self.rollout.generate_sequences(prompts=prompts)
File "/data/verl/verl/utils/debug/performance.py", line 78, in f
return self.log(decorated_function, *args, **kwargs)
File "/data/verl/verl/utils/debug/performance.py", line 88, in log
output = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/data/verl/verl/workers/rollout/sglang_rollout/sglang_rollout.py", line 355, in generate_sequences
return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
File "<string>", line 6, in __init__
File "/data/verl/verl/protocol.py", line 214, in __post_init__
self.check_consistency()
File "/data/verl/verl/protocol.py", line 325, in check_consistency
assert val.shape[0] == batch_size, f"key {key} length {len(val)} is not equal to batch size {batch_size}"
AssertionError: key tools_kwargs length 256 is not equal to batch size 1280
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
(WorkerDict pid=87133) self.sampling_params={'n': 5, 'max_new_tokens': 1024, 'presence_penalty': 0.0, 'frequency_penalty': 0.0, 'repetition_penalty': 1.0, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 7x across cluster]
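
The assertion comes from DataProto.check_consistency, which requires every non_tensor_batch entry to have the same leading dimension as the tensor batch. Below is a minimal, self-contained sketch in plain numpy (not the verl code itself) of the failing invariant and of the shape relationship; whether repeating tools_kwargs inside sglang_rollout.generate_sequences is the right fix is for the maintainers to confirm:

import numpy as np

n = 5                          # rollout.n
num_prompts = 256              # prompts handled by one rollout worker in this run
batch_size = num_prompts * n   # 1280 rows in the repeated tensor batch

# Object array standing in for the per-prompt tools_kwargs entries.
tools_kwargs = np.array([{"prompt_id": i} for i in range(num_prompts)], dtype=object)

def check_consistency(non_tensor_batch, batch_size):
    # Same invariant as the check in verl/protocol.py for non-tensor fields.
    for key, val in non_tensor_batch.items():
        assert val.shape[0] == batch_size, f"key {key} length {len(val)} is not equal to batch size {batch_size}"

try:
    check_consistency({"tools_kwargs": tools_kwargs}, batch_size)   # fails: 256 != 1280
except AssertionError as e:
    print(e)

# Repeating each prompt's entry n times (interleaved with the repeated prompts) restores the invariant.
check_consistency({"tools_kwargs": np.repeat(tools_kwargs, n)}, batch_size)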