_exec_forward_pass: place zeros(1) on the same device as the param (#5576)

Avoid a mismatch between the param storage device and the .data device.

Co-authored-by: Logan Adams <[email protected]>
deepspeedai · Jun 17, 2024 · 1588c2b · 1588c2b
1 parent eda5075
commit 1588c2b
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
@@ -742,7 +742,7 @@ def _exec_forward_pass(self, buffer_id):
                 raise ValueError("expecting a tensor or a tuple of tensors")
             part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group())
             # Clear the large output data, but save the computation graph
-            first_output.data = torch.zeros(1)
+            first_output.data = torch.zeros(1, device=first_output.data.device)
             self.pipe_buffers['output_tensors'][buffer_id] = first_output
             # Inject the partitioned tensor into the output before sending
             outputs = (part.to_meta(), part.data(), *outputs_tail)