diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 09e72a695db3..a2d2465c9666 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1664,6 +1664,8 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False): ##support for NVME secondary param offload #print_rank_0(f"SEC Param id {param.ds_id} status is {param.ds_status}", force=True) if param.ds_status is ZeroParamStatus.AVAILABLE: + if param.ds_secondary_tensor is not None and not has_been_updated: ##param already partitioned + return #check padding tensor_size = self._aligned_size(param) partition_size = tensor_size // self.dp_world_size @@ -1702,7 +1704,8 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False): sec_numel).copy_(one_dim_param.narrow(0, secondary_start, sec_numel)) # TODO: This is a temporary fix to avoid the issue that 2nd tensor all-gather happens before 2nd tensor partition is done - get_accelerator().current_stream().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().current_stream().synchronize() print_rank_0(f"{param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}", force=False)