Fix serialization #120

Closed
wants to merge 5 commits
9 changes: 6 additions & 3 deletions quanto/nn/qmodule.py
@@ -166,17 +166,20 @@ def deserialize_tensor_subclass(t, state_dict, prefix):
             return t.__class__.__tensor_unflatten__(inner_tensors_dict, meta_dict, None, None)

         deserialized_weight = deserialize_tensor_subclass(self.qweight, state_dict, weight_name + ".")
+        device = self.weight.device if self.weight.device.type != "meta" else deserialized_weight.device
         assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False)
         if assign_to_params_buffers:
             self.weight = torch.nn.Parameter(deserialized_weight)
         else:
             if type(self.weight.data) != type(deserialized_weight):
                 # Reloading frozen weights into unfrozen module: move to the correct device and force assignment
-                self.weight = torch.nn.Parameter(deserialized_weight.to(self.weight.device))
+                self.weight = torch.nn.Parameter(deserialized_weight.to(device))
             else:
                 # FIXME: here we should copy frozen weights into frozen module, but this leads to grad error
-                self.weight = torch.nn.Parameter(deserialized_weight.to(self.weight.device))
-
+                self.weight = torch.nn.Parameter(deserialized_weight.to(device))
+        # this is needed because we can't load it correctly when the bias is on the meta device
+        if prefix + "bias" in state_dict:
+            self.bias = torch.nn.Parameter(state_dict.pop(prefix + "bias"))
         super()._load_from_state_dict(
             state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs
         )
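The `device` fallback added above matters when the module was materialized on the meta device (a skeleton without allocated storage): `self.weight.device` is then `meta` and cannot be the target for real data, so the device of the deserialized tensor is used instead. A minimal sketch of that selection logic, with hypothetical tensors standing in for the module weight and the loaded weight:

import torch

# A weight created on the meta device has no backing storage,
# so it cannot serve as the destination device for loaded data.
module_weight = torch.empty(2, 2, device="meta")
deserialized_weight = torch.zeros(2, 2)  # e.g. deserialized onto CPU

# Same selection as in the patch: prefer the module's device,
# falling back to the deserialized tensor's device when on meta.
device = module_weight.device if module_weight.device.type != "meta" else deserialized_weight.device
print(device)  # -> cpu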
10 changes: 9 additions & 1 deletion quanto/serialization.py
@@ -2,7 +2,7 @@
 from typing import Dict, Union

 import torch
-from safetensors.torch import safe_open, save_file
+from safetensors.torch import _remove_duplicate_names, safe_open, save_file


 def safe_save(state_dict: Dict[str, Union[torch.Tensor, str]], filename: Union[str, os.PathLike]):
@@ -20,6 +20,14 @@ def safe_save(state_dict: Dict[str, Union[torch.Tensor, str]], filename: Union[str, os.PathLike]):
             tensors[name] = value
         else:
             metadata[name] = value
+
+    to_removes = _remove_duplicate_names(tensors)
+    for kept_name, to_remove_group in to_removes.items():
+        for to_remove in to_remove_group:
+            del tensors[to_remove]
+
+    metadata["format"] = "pt"
+
     save_file(tensors, filename, metadata)

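For context, safetensors' `save_file` refuses tensors that share storage (as with tied embeddings), which is why duplicates are dropped before saving; `_remove_duplicate_names` is a private safetensors helper that returns, for each tensor kept, the aliased names to delete. A minimal sketch of the deduplication, using a hypothetical state dict with tied weights:

import torch
from safetensors.torch import _remove_duplicate_names, save_file

# Hypothetical state dict where two names alias the same storage,
# as with tied input and output embeddings.
shared = torch.randn(4, 8)
tensors = {"embed.weight": shared, "lm_head.weight": shared}

# Maps each kept name to the aliases to drop,
# e.g. {"embed.weight": ["lm_head.weight"]}.
to_removes = _remove_duplicate_names(tensors)
for kept_name, to_remove_group in to_removes.items():
    for to_remove in to_remove_group:
        del tensors[to_remove]

# save_file would raise on aliased tensors; after deduplication it succeeds.
# The "pt" format marker matches what is written for PyTorch checkpoints.
save_file(tensors, "model.safetensors", metadata={"format": "pt"})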