mosaicml · irenedea · Jul 23, 2024 · Jul 22, 2024 · Jul 23, 2024 · Jul 23, 2024
@@ -436,7 +436,7 @@ def _save_checkpoint(self, state: State, logger: Logger):
         cpu_offload = True
 
         # Add a dtensor->cpu tensor hook to avoid CUDA OOM
-        def dtensor_to_tensor_hook(
+        def tensor_hook(
             module: nn.Module,
             state_dict: Dict[str, Any],
             prefix: str,
@@ -449,20 +449,23 @@ def dtensor_to_tensor_hook(
                     dtensor_fqns.append(fqn)
                     tensor = tensor.full_tensor()  # type: ignore
                     if dist.get_global_rank() == 0:
+                        # Offload any DTensors to CPU
                         if cpu_offload:
                             tensor = tensor.cpu()
                         state_dict[fqn] = tensor
+                    else:
+                        state_dict[fqn] = None
+                # Convert the state dict to the requested precision
+                if isinstance(tensor, torch.Tensor):
+                    state_dict[fqn] = tensor.to(dtype=self.dtype)
+                del tensor
             if dist.get_global_rank() != 0:
-                for fqn in dtensor_fqns:
-                    del state_dict[fqn]
+                state_dict = {}
             return state_dict
 
         hooks = []
         for _, module in state_dict_model.named_modules():
-            if isinstance(module, FSDP):
-                hooks.append(
-                    module._register_state_dict_hook(dtensor_to_tensor_hook),
-                )
+            hooks.append(module._register_state_dict_hook(tensor_hook),)
 
         state_dict = get_model_state_dict(
             state_dict_model,
@@ -474,11 +477,6 @@ def dtensor_to_tensor_hook(
         for hook in hooks:
             hook.remove()
 
-        # Convert the state dict to the requested precision
-        for k, v in state_dict.items():
-            if isinstance(v, torch.Tensor):
-                state_dict[k] = v.to(dtype=self.dtype)
-
         new_model_instance = None  # Need this for pyright because variable could be unbound
 
         if dist.get_global_rank() == 0:
@@ -537,7 +535,7 @@ def dtensor_to_tensor_hook(
                 original_tokenizer.save_pretrained(temp_save_dir)
 
             # Only need to edit files for MPT because it has custom code
-            if original_model.config.model_type == 'mpt':
+            if new_model_instance.config.model_type == 'mpt':
                 log.debug('Editing MPT files for HuggingFace compatibility')
                 edit_files_for_hf_compatibility(
                     temp_save_dir,

diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -383,6 +383,8 @@ def test_huggingface_conversion_callback_interval(
     mlflow_logger_mock.model_registry_prefix = ''
     mlflow_logger_mock._experiment_id = 'mlflow-experiment-id'
     mlflow_logger_mock._run_id = 'mlflow-run-id'
+    mlflow_logger_mock._enabled = True
+    mlflow_logger_mock.run_url = 'fake-url'
     checkpointer_callback.transform_model_pre_registration = MagicMock(
         wraps=checkpointer_callback.transform_model_pre_registration,
     )