(colo01) root@DESKTOP-5H0EB03:~/ColossalAI# cd examples/language/llama/scripts/benchmark_7B
(colo01) root@DESKTOP-5H0EB03:~/ColossalAI/examples/language/llama/scripts/benchmark_7B# bash gemini.sh
/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/pipeline/schedule/_utils.py:19: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/utils/_pytree.py:332: UserWarning: <class 'collections.OrderedDict'> is already registered as pytree node. Overwriting the previous registration.
  warnings.warn(
(the two warnings above are emitted once per rank, 8 times in total)
/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/shardformer/layer/normalization.py:45: UserWarning: Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel
  warnings.warn("Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel")
(emitted once per rank, 8 times in total)
/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/legacy/registry/__init__.py:1: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. Consider using the `torch.compile` optimizer instead.
  import torch.distributed.optim as dist_optim
(emitted once per rank, 8 times in total)
/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/accelerator/cuda_accelerator.py:282: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  return torch.cuda.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
(emitted once per rank, 8 times in total)
[rank6]: Traceback (most recent call last):
[rank6]:   File "/root/ColossalAI/examples/language/llama/benchmark.py", line 364, in <module>
[rank6]:     main()
[rank6]:   File "/root/ColossalAI/examples/language/llama/benchmark.py", line 118, in main
[rank6]:     colossalai.launch_from_torch()
[rank6]:   File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/initialize.py", line 175, in launch_from_torch
[rank6]:     launch(
[rank6]:   File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/initialize.py", line 64, in launch
[rank6]:     cur_accelerator.set_device(local_rank)
[rank6]:   File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/accelerator/cuda_accelerator.py", line 50, in set_device
[rank6]:     torch.cuda.set_device(device)
[rank6]:   File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/cuda/__init__.py", line 420, in set_device
[rank6]:     torch._C._cuda_setDevice(device)
[rank6]: RuntimeError: CUDA error: invalid device ordinal
[rank6]: CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
[rank6]: For debugging consider passing CUDA_LAUNCH_BLOCKING=1
[rank6]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(ranks 5, 4, 3, 1, 7, and 2 print the identical traceback, each ending in the same "RuntimeError: CUDA error: invalid device ordinal")
[09/21/24 06:54:35] INFO     colossalai - colossalai - INFO: /root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/initialize.py:75 launch
                    INFO     colossalai - colossalai - INFO: Distributed environment is initialized, world size: 8
                    WARNING  colossalai - colossalai - WARNING: /root/miniconda3/envs/colo01/lib/python3.10/site-packages/colossalai/booster/plugin/gemini_plugin.py:382 __init__
                    WARNING  colossalai - colossalai - WARNING: enable_async_reduce sets pin_memory=True to achieve best performance, which is not implicitly set.
[rank6]:[W921 06:54:36.685697360 ProcessGroupNCCL.cpp:1168] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
(ranks 5, 7, 4, 2, 3, and 1 print the same ProcessGroupNCCL warning)
Model params: 6.28 B
[extension] Compiling the JIT cpu_adam_x86 kernel during runtime now
W0921 06:54:37.165000 140269539800896 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 810 closing signal SIGTERM
W0921 06:54:37.165000 140269539800896 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 813 closing signal SIGTERM
W0921 06:54:37.165000 140269539800896 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 817 closing signal SIGTERM
E0921 06:54:37.229000 140269539800896 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 1 (pid: 811) of binary: /root/miniconda3/envs/colo01/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/colo01/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.4.1', 'console_scripts', 'torchrun')())
  File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/colo01/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
benchmark.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-09-21_06:54:37
  host      : DESKTOP-5H0EB03.
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 812)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-09-21_06:54:37
  host      : DESKTOP-5H0EB03.
  rank      : 4 (local_rank: 4)
  exitcode  : 1 (pid: 814)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-09-21_06:54:37
  host      : DESKTOP-5H0EB03.
  rank      : 5 (local_rank: 5)
  exitcode  : 1 (pid: 815)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
  time      : 2024-09-21_06:54:37
  host      : DESKTOP-5H0EB03.
  rank      : 6 (local_rank: 6)
  exitcode  : 1 (pid: 816)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-09-21_06:54:37
  host      : DESKTOP-5H0EB03.
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 811)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Error: failed to run torchrun --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=29500 benchmark.py -g -x -b 16 on 127.0.0.1, is localhost: True, exception: Encountered a bad command exit code!
Command: 'cd /root/ColossalAI/examples/language/llama && export SHELL="/bin/bash" ... CUDA_DEVICE_MAX_CONNECTIONS="1" && torchrun --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=29500 benchmark.py -g -x -b 16'
(the full environment-variable export is elided; entries relevant here include WSL_DISTRO_NAME="Ubuntu-24.04", CUDA_HOME="/usr/local/cuda-12.1", LD_LIBRARY_PATH="/usr/local/cuda-12.1/lib64:", OMP_NUM_THREADS="8", and CUDA_DEVICE_MAX_CONNECTIONS="1")
Exit code: 1
Stdout: already printed
Stderr: already printed
====== Training on All Nodes =====
127.0.0.1: failure
====== Stopping All Nodes =====
127.0.0.1: finish
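
Analysis: every failing rank dies inside torch.cuda.set_device(local_rank) with "CUDA error: invalid device ordinal". CUDA raises that error when the requested device index is greater than or equal to the number of visible devices, which is exactly what happens when torchrun is started with --nproc_per_node=8 on a machine that exposes fewer than 8 GPUs; only rank 0 (ordinal 0) gets past initialization here, consistent with a single-GPU WSL2 host. A quick diagnostic sketch to confirm the device count:

# How many GPUs does the driver report, and how many does PyTorch see?
nvidia-smi -L
python -c "import torch; print(torch.cuda.device_count())"
# torch.cuda.set_device(local_rank) fails with "invalid device ordinal"
# whenever local_rank >= torch.cuda.device_count(), so --nproc_per_node=8
# needs at least 8 visible devices.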
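
If the count comes back below 8, the launch configuration has to match the hardware. A sketch of the fix, assuming gemini.sh launches through the colossalai CLI (which generated the torchrun command echoed above) and that this machine exposes a single GPU; the exact flag spelling inside gemini.sh may differ:

# In gemini.sh, set the per-node process count to the number of visible GPUs:
colossalai run --nproc_per_node 1 benchmark.py -g -x -b 16
# or launch torchrun directly:
torchrun --standalone --nproc_per_node=1 benchmark.py -g -x -b 16
# Caveat: a 7B model that was meant to be sharded across 8 GPUs may not fit
# on a single card even with the Gemini plugin's offloading; the batch size
# (-b 16) will likely also need to be reduced.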