Sync layer norm #271
Draft
thomasw21 wants to merge 38 commits into thomas/test_different_layer_norm from thomas/fix_layer_norm
Changes shown are from 11 of the 38 commits.

Commits (38):
07ccb3d Better (thomasw21)
391ed48 Force synchronize the layer norms parameters across all TP (thomasw21)
98d0e7c import mpu (stas00)
279a77e use the bf16 branch for testing (stas00)
87a9dba `torch.testing.assert_equal` didn't make it (#273) (stas00)
dbb5914 Merge remote-tracking branch 'origin/main' into thomas/fix_layer_norm (stas00)
70f91f8 bf16 comms requite pt-1.11 (stas00)
835a3e5 already part of the function (stas00)
37795a9 reproduce the crashing on resume (stas00)
3ec65f7 run just the test we want for now (stas00)
8271d41 all_reduce is an in_place operation (thomasw21)
b418b47 Make a test that TP reshaping works (thomasw21)
4b7207b Woops (thomasw21)
3bc5824 Woops (thomasw21)
05c99db Woops (thomasw21)
55e10c6 Woops (thomasw21)
2ab8a3a Woops (thomasw21)
d357839 Woops (thomasw21)
5fb231c Woops (thomasw21)
cc7ff45 Woops (thomasw21)
7cdb1be Woops (thomasw21)
4574ec9 Fix load issue (thomasw21)
04e89d1 Woops (thomasw21)
e943100 Fix checkpoint path (thomasw21)
09cead3 Test that force sync will allow TP changes (thomasw21)
77abee6 Nit (thomasw21)
64a62c8 Now that we have a force sync mechanism, let's try to reproduce (thomasw21)
0b7afcc Compare model_states_rank (thomasw21)
ce01733 test (thomasw21)
89ab0b7 Row column bias should be synchronized as well (thomasw21)
42997b2 New list of matching embeddings (thomasw21)
e0ef168 Figure out why state differs (thomasw21)
1fc4fe8 Test for final weight (thomasw21)
7ebbed1 Test that torch_rng_state (thomasw21)
2c49216 Fix non matching torch_rng_state for tp_rank=0 (thomasw21)
007ecb4 Update test (thomasw21)
c3844b5 I'm surprised one can apply inplace operation here (thomasw21)
189f054 Test out the loss from the fp32 weights and optimizer states (thomasw21)
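The central change on this branch is forcing the layer-norm parameters to stay synchronized across the tensor-parallel (TP) ranks (the "Force synchronize the layer norms parameters across all TP" commit). As a rough illustration only, and not the PR's actual code, one way to force such a sync is to broadcast the affected parameters from a single source rank; the helper name, the `torch.nn.LayerNorm` check (Megatron uses its own fused layer-norm class), and the `src_global_rank` argument are all assumptions here:

```python
import torch
import torch.distributed as dist

def sync_layer_norm_params(model: torch.nn.Module, tp_group, src_global_rank: int) -> None:
    """Hypothetical helper: make layer-norm weights/biases identical on every TP rank.

    Broadcasts from `src_global_rank` (e.g. the global rank that holds TP rank 0
    in `tp_group`). The PR may implement the sync differently, e.g. via a reduce.
    """
    with torch.no_grad():
        for module in model.modules():
            # stand-in check; the real model uses Megatron's fused layer norm
            if isinstance(module, torch.nn.LayerNorm):
                for param in (module.weight, module.bias):
                    if param is not None:
                        dist.broadcast(param, src=src_global_rank, group=tp_group)
```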
@@ -682,6 +682,8 @@ def test_layer_norm_consistent(self, variation):
         execute_subprocess_async(cmd, env=self.get_env())

+        checkpoints = ["global_step10", "global_step20"]
+
         # Check transformer layer norm
         keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
@@ -691,8 +693,9 @@ def test_layer_norm_consistent(self, variation):
             weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
             ref = weights[0]
             for weight in weights[1:]:
-                torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                torch_assert_equal(ref, weight, check_device=False)
+
         # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -702,4 +705,15 @@ def test_layer_norm_consistent(self, variation):
             weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
             ref = weights[0]
             for weight in weights[1:]:
-                torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                torch_assert_equal(ref, weight, check_device=False)
+
+        # 2. test training from checkpoint: resume
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)

Review comment on the second execute_subprocess_async call (the resume run): "so it crashes on resume:"
@stas00 Essentially the reduce is an in-place operator, which means that at each forward pass self.weight was updated with the sum of the weights of all tp_ranks. We could try thinking of a better fix by doing an average reduce, but I'm scared that back propagation doesn't play well with this in-place logic.

The new test fails with:

This is more expected since the previous run should have consumed all the tokens. Going to update #272 and restart the training.
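To make the failure mode concrete, here is a minimal sketch of the pattern being described; it is illustrative only, not the repository's actual layer-norm code, and the class name and `tp_group` handle are assumptions:

```python
import torch
import torch.distributed as dist

class BuggySyncedLayerNorm(torch.nn.LayerNorm):
    """Illustration of the bug: trying to sync by all-reducing the parameter itself."""

    def __init__(self, normalized_shape, tp_group=None):
        super().__init__(normalized_shape)
        self.tp_group = tp_group  # assumed handle to the tensor-parallel process group

    def forward(self, x):
        # all_reduce mutates its input in place, so after every forward pass
        # self.weight/self.bias hold the SUM over all TP ranks and keep growing.
        dist.all_reduce(self.weight.data, op=dist.ReduceOp.SUM, group=self.tp_group)
        dist.all_reduce(self.bias.data, op=dist.ReduceOp.SUM, group=self.tp_group)
        return super().forward(x)
```

Because the reduce writes straight back into the parameter, ranks that already agree end up multiplying their weights by the TP world size on every forward pass instead of leaving them unchanged.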
Should we extend Megatron-DeepSpeed/megatron/mpu/mappings.py, lines 22 to 30 in 87a9dba, to support an optional ReduceOp.AVG?
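The referenced mappings.py snippet is not reproduced on this page, so the following is only a sketch of what such an extension could look like, assuming the helper resembles Megatron's `_reduce` and that the `get_tensor_model_parallel_*` helpers are the right names here:

```python
import torch
from torch.distributed import ReduceOp

# Assumed imports; the actual module pulls these helpers from megatron.mpu.
from megatron.mpu import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size


def _reduce(input_, op=ReduceOp.SUM):
    """All-reduce input_ across the tensor-parallel group, with a configurable op."""
    # Bypass the reduce when tensor parallelism is not used.
    if get_tensor_model_parallel_world_size() == 1:
        return input_
    torch.distributed.all_reduce(input_, op=op, group=get_tensor_model_parallel_group())
    return input_
```

ReduceOp.AVG itself is only available with newer PyTorch/NCCL builds (roughly PyTorch 1.11 onwards), which lines up with the pt-1.11 note in the commit list.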
I think this is tricky. The reason is that it means we need to implement a custom backward function: since the forward computes the average, the gradient needs to be divided by the tp world size.
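A sketch of what that custom backward could look like, as a standalone illustration rather than code from this branch (the class name is made up and the process group is passed in explicitly):

```python
import torch
import torch.distributed as dist


class _AvgReduceFromTensorParallelRegion(torch.autograd.Function):
    """Forward: average over the TP group. Backward: scale gradients to match."""

    @staticmethod
    def forward(ctx, input_, group):
        ctx.world_size = dist.get_world_size(group=group)
        output = input_.clone()  # avoid mutating the caller's tensor in place
        dist.all_reduce(output, op=dist.ReduceOp.SUM, group=group)
        output /= ctx.world_size  # this is the "average reduce"
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # Each rank's input enters the output with a 1/world_size coefficient,
        # so its gradient has to be divided by the TP world size as well.
        return grad_output / ctx.world_size, None
```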
Also I don't think we save much compute by supporting that.