@@ -435,8 +435,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
 
         cpu_offload = True
 
-        # Add a dtensor->cpu tensor hook to avoid CUDA OOM
-        def dtensor_to_tensor_hook(
+        # Add hook to move tensors to cpu to avoid CUDA OOM
+        def tensor_hook(
             module: nn.Module,
             state_dict: Dict[str, Any],
             prefix: str,
@@ -449,20 +449,23 @@ def dtensor_to_tensor_hook(
                     dtensor_fqns.append(fqn)
                     tensor = tensor.full_tensor()  # type: ignore
                     if dist.get_global_rank() == 0:
+                        # Offload any DTensors to CPU
                         if cpu_offload:
                             tensor = tensor.cpu()
                         state_dict[fqn] = tensor
+                    else:
+                        state_dict[fqn] = None
+                # Convert the state dict to the requested precision
+                if isinstance(tensor, torch.Tensor):
+                    state_dict[fqn] = tensor.to(dtype=self.dtype)
+                del tensor
             if dist.get_global_rank() != 0:
-                for fqn in dtensor_fqns:
-                    del state_dict[fqn]
+                state_dict = {}
             return state_dict
 
         hooks = []
         for _, module in state_dict_model.named_modules():
-            if isinstance(module, FSDP):
-                hooks.append(
-                    module._register_state_dict_hook(dtensor_to_tensor_hook),
-                )
+            hooks.append(module._register_state_dict_hook(tensor_hook),)
 
         state_dict = get_model_state_dict(
             state_dict_model,
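
The registration above uses PyTorch's private `nn.Module._register_state_dict_hook`: the hook fires whenever `state_dict()` is materialized, may edit entries in place or return a replacement dict, and registration returns a handle whose `remove()` detaches it (which this PR calls once saving is done). Below is a minimal sketch of that mechanism, assuming a recent PyTorch; `fp16_hook` is an illustrative stand-in, not the PR's `tensor_hook`:

    import torch
    from torch import nn

    def fp16_hook(module, state_dict, prefix, *args):
        # Rewrite entries as the state dict is built; returning the dict
        # hands back the (possibly replaced) mapping, as tensor_hook does.
        for key, value in state_dict.items():
            if isinstance(value, torch.Tensor) and value.is_floating_point():
                state_dict[key] = value.to(dtype=torch.float16)
        return state_dict

    model = nn.Linear(4, 4)
    handle = model._register_state_dict_hook(fp16_hook)  # private API
    sd = model.state_dict()  # hook fires here
    assert sd['weight'].dtype == torch.float16
    handle.remove()  # detach afterwards, mirroring hook.remove() below
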
@@ -474,11 +477,6 @@ def dtensor_to_tensor_hook(
         for hook in hooks:
             hook.remove()
 
-        # Convert the state dict to the requested precision
-        for k, v in state_dict.items():
-            if isinstance(v, torch.Tensor):
-                state_dict[k] = v.to(dtype=self.dtype)
-
         new_model_instance = None  # Need this for pyright because variable could be unbound
 
         if dist.get_global_rank() == 0:
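
The loop deleted above cast the state dict to `self.dtype` only after it had been fully gathered, so the full-precision and downcast copies could coexist and contribute to the CUDA OOM this PR targets. Moving the cast into the hook body converts one tensor at a time and `del`s the reference immediately. A standalone sketch of that per-key pattern, with `torch.bfloat16` standing in for `self.dtype`:

    import torch

    state_dict = {'w': torch.randn(8, 8), 'b': torch.randn(8)}

    # Cast key by key: once each key is overwritten and the local name is
    # dropped, nothing keeps the full-precision copy alive, unlike a
    # second pass over the completed state dict.
    for fqn in list(state_dict):
        tensor = state_dict[fqn]
        if isinstance(tensor, torch.Tensor):
            state_dict[fqn] = tensor.to(dtype=torch.bfloat16)
        del tensor

    assert all(t.dtype == torch.bfloat16 for t in state_dict.values())
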
@@ -537,7 +535,7 @@ def dtensor_to_tensor_hook(
                 original_tokenizer.save_pretrained(temp_save_dir)
 
             # Only need to edit files for MPT because it has custom code
-            if original_model.config.model_type == 'mpt':
+            if new_model_instance.config.model_type == 'mpt':
                 log.debug('Editing MPT files for HuggingFace compatibility')
                 edit_files_for_hf_compatibility(
                     temp_save_dir,