
Commit

fix tb logging bug
Spico197 committed Oct 7, 2023
1 parent: 8bff6be · commit: f0e5ae3
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -9,7 +9,7 @@
"type": "python",
"request": "attach",
"connect": {
"host": "SH-IDCA1404-10-140-54-23",
"host": "SH-IDCA1404-10-140-54-122",
"port": 5678
},
"pathMappings": [
25 changes: 20 additions & 5 deletions scripts/cpt/fpt_13b.sh
@@ -3,6 +3,8 @@
#SBATCH --job-name=cpt-13b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
##SBATCH --output=logs/%x.log
##SBATCH --error=logs/%x.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
@@ -12,7 +14,7 @@
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --quotatype=auto
#SBATCH --time=5:00:00
##SBATCH --time=5:00:00

source ~/anaconda3/bin/activate smoe

@@ -28,29 +30,40 @@ source ~/anaconda3/bin/activate smoe
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

# comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4, expert weight re-scale"
comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4"
# comment="random initialized llama1-7B"
# comment="random initialized llama1-13B"
# comment="7B, expert 4/16, noisy gate, gradient shared neurons, w/o residual, w/o weight re-scale, lr2e-4"
# comment="3B MoE, debug"

# model_type="llama"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/llama_13B"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
model_type="llama_moe"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_3B-8Select2-4320Neurons-Share"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-16Select4-688Neurons-Share"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons-Share"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="

# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax-copy/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

lr=4e-4
lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=1
@@ -71,6 +84,7 @@ source ~/anaconda3/bin/activate smoe

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
# output_dir=/mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
@@ -85,6 +99,7 @@ source ~/anaconda3/bin/activate smoe
echo "Node: $head_node"
echo "Node IP: $head_node_ip"

# --resume_from_checkpoint /mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4/checkpoint-2000 \
srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
@@ -117,7 +132,7 @@ source ~/anaconda3/bin/activate smoe
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 2 \
--save_total_limit 1 \
--save_steps 1000 \
--dataloader_num_workers 0 \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
11 changes: 8 additions & 3 deletions scripts/tokenize/redpajama.sh
@@ -2,9 +2,14 @@

set -vx

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data
out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data
# out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_3B
data_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples
out_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

logs_dir=logs

mkdir -p $logs_dir
38 changes: 15 additions & 23 deletions smoe/callbacks/tensorboard.py
@@ -30,30 +30,29 @@ def on_log(
tokens = state.global_step * args.num_tokens_per_batch
token_loss_key = "train/loss_on_tokens"
self.tb_writer.add_scalar(token_loss_key, v, tokens)
elif (
k == "train/balance_loss"
and isinstance(v, torch.Tensor)
and hasattr(v, "item")
):
self.tb_writer.add_scalar(k, v.item(), state.global_step)
elif (
k == "train/num_dropped_tokens"
and isinstance(v, tuple)
and all(isinstance(n, torch.Tensor) for n in v)
):
elif k == "train/balance_loss":
if isinstance(v, torch.Tensor) and hasattr(v, "item"):
_v = v.item()
elif isinstance(v, float):
_v = v
else:
continue
self.tb_writer.add_scalar(k, _v, state.global_step)
elif k == "train/num_dropped_tokens" and isinstance(v, tuple):
# (tensor(1.0), tensor(2.3)) -> [1.0, 2.3]
v = [n.item() for n in v]
if all(isinstance(n, torch.Tensor) for n in v):
v = [n.item() for n in v]
self.tb_writer.add_scalars(
f"{k}/layer",
{str(i): n for i, n in enumerate(v)},
state.global_step,
)
self.tb_writer.add_scalar(f"{k}/total", sum(v), state.global_step)
elif (
(k == "train/gate_load" or k == "train/gate_importance")
and isinstance(v, tuple)
and all(isinstance(n, torch.Tensor) for n in v)
):
k == "train/gate_load" or k == "train/gate_importance"
) and isinstance(v, tuple):
if not all(isinstance(n, torch.Tensor) for n in v):
v = [torch.tensor(n) for n in v]
# v: (tensor([1.0, 2.3, ... num_experts]), tensor([3.0, 4.5, ... num_experts]), ... num_layers)
self.tb_writer.add_scalars(
f"{k}/std/layer",
@@ -63,11 +62,4 @@ def on_log(
self.tb_writer.add_image(
k, get_heatmap_img_grid_for_tb(v), state.global_step
)
else:
logger.warning(
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of Tensorboard's writer.add_scalar() "
"is incorrect so we dropped this attribute."
)
self.tb_writer.flush()
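
Note on the fix above: the callback previously required train/balance_loss to be a tensor and train/num_dropped_tokens plus the gate statistics to be tuples of tensors, so values arriving as plain floats or already-converted lists fell into the (now removed) warning branch and were dropped. The rewritten branches normalize the value before logging it. A minimal standalone sketch of that normalization idea, assuming only PyTorch; the helper names below are illustrative and not functions from this repository:

import torch

def to_scalar(value):
    # Tensor -> Python float via .item(); plain numbers pass through;
    # anything else returns None so the caller can skip logging it.
    if isinstance(value, torch.Tensor):
        return value.item()
    if isinstance(value, (int, float)):
        return float(value)
    return None

def to_float_list(values):
    # Accept a tuple/list that may mix tensors and numbers, return plain floats.
    return [v.item() if isinstance(v, torch.Tensor) else float(v) for v in values]
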
12 changes: 10 additions & 2 deletions smoe/entrypoint/cpt/cpt_fpt.py
@@ -49,7 +49,7 @@
)


# @wechat_sender()
@wechat_sender()
def main():
model_args, data_args, training_args = parse_args(
ModelArguments, DataArguments, EnhancedTrainingArguments
@@ -132,7 +132,7 @@ def main():

# zhutong: this is for debug usage only
if training_args.debug_mode:
config.num_hidden_layers = 2
config.num_hidden_layers = 1

tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
@@ -177,6 +177,13 @@ def main():
block_size = min(data_args.block_size, tokenizer.model_max_length)

if data_args.prob_map is None:
# slimpajama samples openllama-3B tokenized
# data_args.prob_map = {
# "cc": 0.67,
# "wikipedia": 0.33,
# }

# redpajama
data_args.prob_map = {
"en_cc": 0.67,
"en_c4": 0.15,
@@ -239,6 +246,7 @@ def main():
ModelClass = MODEL_MAP[model_args.model_type]

# model = LlamaForCausalLM(config)
# model.half()
# model.to(torch_dtype)

model: LlamaForCausalLM | LlamaMoEForCausalLM = ModelClass.from_pretrained(
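
Note: the prob_map hunk above sets per-domain sampling weights for the pretraining mixture (the RedPajama dict is truncated by the diff view, so the remaining entries are not shown here). As a hedged sketch of how such a map is typically consumed, the snippet below draws source domains proportionally to their weights; the weights and the "other" key are placeholders, and random.choices stands in for whatever sampler this codebase actually uses:

import random

prob_map = {"en_cc": 0.67, "en_c4": 0.15, "other": 0.18}  # placeholder weights summing to 1.0
domains, weights = zip(*prob_map.items())

def sample_domain() -> str:
    # Pick the next source domain proportionally to its configured probability.
    return random.choices(domains, weights=weights, k=1)[0]

counts = {d: 0 for d in domains}
for _ in range(10_000):
    counts[sample_domain()] += 1
print(counts)  # counts are roughly proportional to prob_map
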
12 changes: 8 additions & 4 deletions smoe/trainer/llama_lr_scheduling.py
@@ -192,10 +192,12 @@ def _maybe_log_save_evaluate(
4,
)
logs["learning_rate"] = self._get_learning_rate()
logs["num_dropped_tokens"] = num_dropped_tokens
logs["gate_load"] = gate_load
logs["gate_importance"] = gate_importance
logs["balance_loss"] = balance_loss
logs["num_dropped_tokens"] = [x.item() for x in num_dropped_tokens]
logs["gate_load"] = [x.detach().cpu().tolist() for x in gate_load]
logs["gate_importance"] = [
x.detach().cpu().tolist() for x in gate_importance
]
logs["balance_loss"] = balance_loss.item()

self._total_loss_scalar += tr_loss_scalar
self._globalstep_last_logged = self.state.global_step
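
Note: the four MoE statistics are now converted to plain Python numbers and lists before they are placed in logs, which keeps the log payload JSON-serializable and free of CUDA tensor references by the time it reaches the callbacks. A hedged illustration of the same conversion on dummy data (shapes and values below are made up for the example):

import torch

# Pretend per-layer statistics as they might come out of an MoE forward pass.
num_dropped_tokens = (torch.tensor(3.0), torch.tensor(1.0))
gate_load = (torch.rand(16), torch.rand(16))
balance_loss = torch.tensor(0.02)

logs = {}
logs["num_dropped_tokens"] = [x.item() for x in num_dropped_tokens]  # tensors -> floats
logs["gate_load"] = [x.detach().cpu().tolist() for x in gate_load]   # tensors -> nested lists
logs["balance_loss"] = balance_loss.item()                           # 0-dim tensor -> float
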
@@ -376,6 +378,8 @@ def _inner_training_loop(
self._created_lr_scheduler = False

if self.is_deepspeed_enabled:
# # zhutong: move model to cuda device for fused optim init
# self.model.to(self.accelerator.device)
self.optimizer, self.lr_scheduler = deepspeed_init(
self, num_training_steps=max_steps
)
5 changes: 4 additions & 1 deletion smoe/utils/io.py
@@ -45,7 +45,10 @@ def tell(self):

def __iter__(self):
for line in self.fin:
yield json.loads(line)
try:
yield json.loads(line)
except json.JSONDecodeError:
pass
self.fin.close()
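
Note: the iterator now skips lines that fail to parse instead of raising and aborting the whole pass over the file. A minimal standalone sketch of the same pattern, with an illustrative function name that is not part of the repository:

import json

def iter_jsonl(path: str):
    # Yield one parsed object per line, silently dropping malformed records,
    # mirroring the try/except added in the diff above.
    with open(path, "r", encoding="utf8") as fin:
        for line in fin:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue
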


5 changes: 4 additions & 1 deletion smoe/utils/visualization/visualize.py
@@ -191,6 +191,9 @@ def visualize_swiglu_output(


def find_factors_with_minimal_sum(number):
if number == 1:
return (1, 1)

# Initialize variables to keep track of the factors with the minimal sum
min_sum = float("inf")
min_factors = None
@@ -286,7 +289,7 @@ def vis_tuple_heatmaps(tensors: tuple[torch.FloatTensor]):
axes = axes.reshape(*img_grid)
for i in range(data.shape[0]):
ax = axes[i // img_grid[1], i % img_grid[1]]
im = ax.imshow(
ax.imshow(
data[i].cpu().reshape(*shape).float().detach().numpy(),
cmap=cmap,
interpolation="nearest",
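
Note: the first hunk in this file adds an explicit early return for number == 1 in find_factors_with_minimal_sum, and the second drops an unused im = binding around ax.imshow. For context, one plausible implementation of such a helper, shown only as a sketch consistent with the early return added here and not necessarily the repository's exact body, picks the factor pair with the smallest sum so per-expert heatmaps can be laid out on a near-square grid:

import math

def find_factors_with_minimal_sum(number: int) -> tuple[int, int]:
    if number == 1:
        return (1, 1)
    # Track the factor pair (a, b) with a * b == number and minimal a + b.
    min_sum = float("inf")
    min_factors = (1, number)
    for a in range(1, math.isqrt(number) + 1):
        if number % a == 0:
            b = number // a
            if a + b < min_sum:
                min_sum = a + b
                min_factors = (a, b)
    return min_factors
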
