From 02288bc1fdd4dd0b48f3933db8948f5a0d5c9545 Mon Sep 17 00:00:00 2001 From: RyanInnerpeace Date: Thu, 30 Nov 2023 10:12:52 +0800 Subject: [PATCH] fix: to solve #4726 (#4727) To solve #4726, I changed the dtype of the loss tensor to float32 in the last stage of the pipeline. **test result** before dist.broadcast ``` [2023-11-24 14:06:04,709] [INFO] [engine.py:590:_aggregate_total_loss] [Rank 2] before dist.broadcast(is_last_stage) (tensor([2.3203, 2.3203], device='cuda:2'), torch.float32, device(type='cuda', index=2)), src_rank=2 (1, 2) [2023-11-24 14:06:04,710] [INFO] [engine.py:590:_aggregate_total_loss] [Rank 3] before dist.broadcast(is_last_stage) (tensor([2.3203, 2.3203], device='cuda:3'), torch.float32, device(type='cuda', index=3)), src_rank=3 (1, 2) ``` After dist.broadcast, you can see the broadcast result is correct between rank 2 and rank 0 as well as rank 3 and rank 1. ``` [2023-11-24 14:06:05,016] [INFO] [engine.py:608:_aggregate_total_loss] [Rank 1] after dist.broadcast(other stage) (tensor([2.3203, 2.3203], device='cuda:1'), torch.float32) [2023-11-24 14:06:05,043] [INFO] [engine.py:608:_aggregate_total_loss] [Rank 0] after dist.broadcast(other stage) (tensor([2.3203, 2.3203], device='cuda:0'), torch.float32) ``` For more information, please refer to #4726. Co-authored-by: ryan Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/pipe/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index f08657b793d7..c8d6a0bff444 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -556,7 +556,7 @@ def _aggregate_total_loss(self): agg_loss /= self.dp_world_size assert self.global_rank in self.grid.pp_group - losses = torch.stack([self.dp_group_loss, agg_loss]) + losses = torch.stack([self.dp_group_loss, agg_loss]).float() if self.is_pipe_parallel: dist.broadcast(tensor=losses, src=self.global_rank, group=self.mpu.get_pipe_parallel_group()) else: