From 1588c2bcf7f2ceb3e443bed8996661c57378de5d Mon Sep 17 00:00:00 2001
From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com>
Date: Mon, 17 Jun 2024 19:55:32 +0300
Subject: [PATCH] _exec_forward_pass: place zeros(1) on the same device as the
 param (#5576)

avoid mismatch between the param storage device and the .data device

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/pipe/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index ab4919a0f0abd..9e84121d50fa8 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -742,7 +742,7 @@ def _exec_forward_pass(self, buffer_id):
                 raise ValueError("expecting a tensor or a tuple of tensors")
             part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group())
             # Clear the large output data, but save the computation graph
-            first_output.data = torch.zeros(1)
+            first_output.data = torch.zeros(1, device=first_output.data.device)
             self.pipe_buffers['output_tensors'][buffer_id] = first_output
             # Inject the partitioned tensor into the output before sending
             outputs = (part.to_meta(), part.data(), *outputs_tail)
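
Editor's note: the snippet below is not part of the patch; it is a minimal standalone sketch of the mismatch the change avoids, assuming a CUDA device is available. Before the fix, the placeholder created by torch.zeros(1) lands on the default (CPU) device, so the tensor's .data no longer matches the device the output originally lived on; passing the device keeps the placeholder where the original data was.

    # Minimal sketch (hypothetical example, not DeepSpeed code); assumes CUDA is available.
    import torch

    # Stand-in for a pipeline stage output living on the accelerator.
    first_output = torch.randn(4, 4, device="cuda", requires_grad=True)

    # Before the fix: torch.zeros(1) defaults to CPU, so .data silently
    # moves to a different device than the output's storage.
    first_output.data = torch.zeros(1)
    print(first_output.data.device)  # cpu

    # After the fix: the placeholder stays on the output's original device.
    first_output = torch.randn(4, 4, device="cuda", requires_grad=True)
    first_output.data = torch.zeros(1, device=first_output.data.device)
    print(first_output.data.device)  # cuda:0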