Facing Out Of Memory issue for llama-13b model when trained on 4 GPUs #3554
premanand09 started this conversation in General
Hi,
I am trying to train the llama-13b model on 4 GPUs of roughly 15360 MiB each, about 61 GB in total. After launching training, I hit a CUDA out-of-memory error. I expected the model to fit in this much GPU memory, so any help would be appreciated. My rough per-GPU estimate is below, followed by the traceback, launch command, and configs.
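For context, here is my rough back-of-the-envelope estimate of the per-GPU footprint; my understanding is that ZeRO stage 2 shards gradients and optimizer states but keeps a full fp16 parameter copy on every rank, so please correct me if I am reading that wrong (all numbers below are my assumptions, not from the logs):

```python
# Rough per-GPU memory sketch for a ~13B-parameter model under ZeRO stage 2.
# Assumptions: fp16 weights, Adam-style optimizer (fp32 master weights plus
# two fp32 moments), 4 GPUs, and that stage 2 does NOT partition parameters.
params = 13.0e9
world_size = 4
gib = 1024**3

fp16_weights_per_gpu = params * 2 / gib                             # full copy on each rank
fp16_grads_per_gpu = params * 2 / world_size / gib                  # sharded by stage 2
optimizer_states_per_gpu = params * (4 + 4 + 4) / world_size / gib  # fp32 master + 2 moments, sharded

print(f"fp16 weights per GPU:     {fp16_weights_per_gpu:5.1f} GiB")      # ~24.2 GiB
print(f"fp16 gradients per GPU:   {fp16_grads_per_gpu:5.1f} GiB")        # ~6.1 GiB
print(f"optimizer states per GPU: {optimizer_states_per_gpu:5.1f} GiB")  # ~36.3 GiB
```

The relevant part of the traceback: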
│ 795 │ def _apply(self, fn): │
│ 796 │ │ for module in self.children(): │
│ ❱ 797 │ │ │ module._apply(fn) │
│ 798 │ │ │
│ 799 │ │ def compute_should_use_set_data(tensor, tensor_applied): │
│ 800 │ │ │ if torch._has_compatible_shallow_copy_type(tensor, tensor_applied): │
│ │
│ /opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py:820 in _apply │
│ │
│ 817 │ │ │ # track autograd history of param_applied, so we have to use │
│ 818 │ │ │ # with torch.no_grad(): │
│ 819 │ │ │ with torch.no_grad(): │
│ ❱ 820 │ │ │ │ param_applied = fn(param) │
│ 821 │ │ │ should_use_set_data = compute_should_use_set_data(param, param_applied) │
│ 822 │ │ │ if should_use_set_data: │
│ 823 │ │ │ │ param.data = param_applied │
│ │
│ /opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py:1143 in convert │
│ │
│ 1140 │ │ │ if convert_to_format is not None and t.dim() in (4, 5): │
│ 1141 │ │ │ │ return t.to(device, dtype if t.is_floating_point() or t.is_complex() els │
│ 1142 │ │ │ │ │ │ │ non_blocking, memory_format=convert_to_format) │
│ ❱ 1143 │ │ │ return t.to(device, dtype if t.is_floating_point() or t.is_complex() else No │
│ 1144 │ │ │
│ 1145 │ │ return self._apply(convert) │
│ 1146 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB (GPU 0; 14.75 GiB total capacity; 14.16 GiB already allocated; 40.81
MiB free; 14.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid
fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
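The message suggests trying max_split_size_mb via PYTORCH_CUDA_ALLOC_CONF. A minimal sketch of how I could set it, assuming it goes at the very top of model/model_training/trainer_sft.py before torch or deepspeed is imported (128 is just an illustrative value):

```python
import os

# Allocator hint from the error message: cap the split size used by the CUDA
# caching allocator to reduce fragmentation. It has to be in the environment
# before the first CUDA allocation, so it is set before torch is imported.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")  # 128 MiB is an illustrative value
```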
Here is the command I am using:
deepspeed model/model_training/trainer_sft.py --local_rank=0 --configs defaults llama-13b-pretrain pretrain --cache_dir data_cache --output_dir saved_models/llama-13b-super-pretrain --deepspeed --show_dataset_stats --wandb_entity premnayar4
Here is the llama-13b-pretrain config:
llama-13b-pretrain:
  dtype: fp16
  log_dir: "llama_log_13b"
  learning_rate: 1e-5
  model_name: saved_models/llama-13B-hf-weights
  output_dir: saved_models/llama-13b-super-pretrain
  deepspeed_config: model/model_training/configs/zero_config_pretrain.json
  weight_decay: 0.0
  residual_dropout: 0.0
  max_length: 2048
  use_flash_attention: true
  warmup_steps: 100
  gradient_checkpointing: true
  gradient_accumulation_steps: 4
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  eval_steps: 200
  save_steps: 500
  num_train_epochs: 1
  save_total_limit: 2
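For reference, the effective global batch size these settings imply, as a quick sanity check (assuming 4 GPUs, since train_batch_size in the DeepSpeed config below is left on "auto"):

```python
# Global batch size implied by the trainer config above, assuming 4 GPUs.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
num_gpus = 4

global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(global_batch_size)  # 64 sequences of up to max_length=2048 tokens per optimizer step
```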
Here is the DeepSpeed config I am using:
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto",
      "warmup_type": "linear"
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 1e9,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 1e9,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
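One more thing I am unsure about: my understanding is that allgather_bucket_size and reduce_bucket_size are element counts, so at 1e9 elements each bucket is a sizeable fixed buffer on every GPU (rough sketch below; happy to be corrected if the units are different):

```python
# Approximate size of the ZeRO stage 2 communication buffers implied by the
# config above, assuming bucket sizes are element counts and fp16 elements.
bytes_per_element = 2
gib = 1024**3

allgather_bucket_gib = 1e9 * bytes_per_element / gib  # ~1.9 GiB
reduce_bucket_gib = 1e9 * bytes_per_element / gib     # ~1.9 GiB
print(f"allgather bucket ~{allgather_bucket_gib:.1f} GiB, reduce bucket ~{reduce_bucket_gib:.1f} GiB")
```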