
Commit

fix tb logging bug
Spico197 committed Oct 7, 2023
1 parent: 8bff6be · commit: f0e5ae3
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -9,7 +9,7 @@
"type": "python",
"request": "attach",
"connect": {
"host": "SH-IDCA1404-10-140-54-23",
"host": "SH-IDCA1404-10-140-54-122",
"port": 5678
},
"pathMappings": [
25 changes: 20 additions & 5 deletions scripts/cpt/fpt_13b.sh
@@ -3,6 +3,8 @@
#SBATCH --job-name=cpt-13b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
##SBATCH --output=logs/%x.log
##SBATCH --error=logs/%x.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
@@ -12,7 +14,7 @@
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --quotatype=auto
#SBATCH --time=5:00:00
##SBATCH --time=5:00:00

source ~/anaconda3/bin/activate smoe

@@ -28,29 +30,40 @@ source ~/anaconda3/bin/activate smoe
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

# comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4, expert weight re-scale"
comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4"
# comment="random initialized llama1-7B"
# comment="random initialized llama1-13B"
# comment="7B, expert 4/16, noisy gate, gradient shared neurons, w/o residual, w/o weight re-scale, lr2e-4"
# comment="3B MoE, debug"

# model_type="llama"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/llama_13B"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
model_type="llama_moe"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_3B-8Select2-4320Neurons-Share"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-16Select4-688Neurons-Share"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons-Share"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="

# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax-copy/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

lr=4e-4
lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=1
@@ -71,6 +84,7 @@ source ~/anaconda3/bin/activate smoe

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
# output_dir=/mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
@@ -85,6 +99,7 @@ source ~/anaconda3/bin/activate smoe
echo "Node: $head_node"
echo "Node IP: $head_node_ip"

# --resume_from_checkpoint /mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4/checkpoint-2000 \
srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
@@ -117,7 +132,7 @@ source ~/anaconda3/bin/activate smoe
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 2 \
--save_total_limit 1 \
--save_steps 1000 \
--dataloader_num_workers 0 \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
11 changes: 8 additions & 3 deletions scripts/tokenize/redpajama.sh
@@ -2,9 +2,14 @@

set -vx

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data
out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data
# out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_3B
data_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples
out_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

logs_dir=logs

mkdir -p $logs_dir
38 changes: 15 additions & 23 deletions smoe/callbacks/tensorboard.py
@@ -30,30 +30,29 @@ def on_log(
tokens = state.global_step * args.num_tokens_per_batch
token_loss_key = "train/loss_on_tokens"
self.tb_writer.add_scalar(token_loss_key, v, tokens)
elif (
k == "train/balance_loss"
and isinstance(v, torch.Tensor)
and hasattr(v, "item")
):
self.tb_writer.add_scalar(k, v.item(), state.global_step)
elif (
k == "train/num_dropped_tokens"
and isinstance(v, tuple)
and all(isinstance(n, torch.Tensor) for n in v)
):
elif k == "train/balance_loss":
if isinstance(v, torch.Tensor) and hasattr(v, "item"):
_v = v.item()
elif isinstance(v, float):
_v = v
else:
continue
self.tb_writer.add_scalar(k, _v, state.global_step)
elif k == "train/num_dropped_tokens" and isinstance(v, tuple):
# (tensor(1.0), tensor(2.3)) -> [1.0, 2.3]
v = [n.item() for n in v]
if all(isinstance(n, torch.Tensor) for n in v):
v = [n.item() for n in v]
self.tb_writer.add_scalars(
f"{k}/layer",
{str(i): n for i, n in enumerate(v)},
state.global_step,
)
self.tb_writer.add_scalar(f"{k}/total", sum(v), state.global_step)
elif (
(k == "train/gate_load" or k == "train/gate_importance")
and isinstance(v, tuple)
and all(isinstance(n, torch.Tensor) for n in v)
):
k == "train/gate_load" or k == "train/gate_importance"
) and isinstance(v, tuple):
if not all(isinstance(n, torch.Tensor) for n in v):
v = [torch.tensor(n) for n in v]
# v: (tensor([1.0, 2.3, ... num_experts]), tensor([3.0, 4.5, ... num_experts]), ... num_layers)
self.tb_writer.add_scalars(
f"{k}/std/layer",
@@ -63,11 +62,4 @@ def on_log(
self.tb_writer.add_image(
k, get_heatmap_img_grid_for_tb(v), state.global_step
)
else:
logger.warning(
"Trainer is attempting to log a value of "
f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
"This invocation of Tensorboard's writer.add_scalar() "
"is incorrect so we dropped this attribute."
)
self.tb_writer.flush()
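
Note on the fix above: the callback previously required train/balance_loss to be a tensor and train/num_dropped_tokens plus the gate statistics to be tuples of tensors, so values arriving as plain floats or already-converted lists fell into the (now removed) warning branch and were dropped. The rewritten branches normalize the value before logging it. A minimal standalone sketch of that normalization idea, assuming only PyTorch; the helper names below are illustrative and not functions from this repository:

import torch

def to_scalar(value):
    # Tensor -> Python float via .item(); plain numbers pass through;
    # anything else returns None so the caller can skip logging it.
    if isinstance(value, torch.Tensor):
        return value.item()
    if isinstance(value, (int, float)):
        return float(value)
    return None

def to_float_list(values):
    # Accept a tuple/list that may mix tensors and numbers, return plain floats.
    return [v.item() if isinstance(v, torch.Tensor) else float(v) for v in values]
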
12 changes: 10 additions & 2 deletions smoe/entrypoint/cpt/cpt_fpt.py
@@ -49,7 +49,7 @@
)


# @wechat_sender()
@wechat_sender()
def main():
model_args, data_args, training_args = parse_args(
ModelArguments, DataArguments, EnhancedTrainingArguments
@@ -132,7 +132,7 @@ def main():

# zhutong: this is for debug usage only
if training_args.debug_mode:
config.num_hidden_layers = 2
config.num_hidden_layers = 1

tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
@@ -177,6 +177,13 @@ def main():
block_size = min(data_args.block_size, tokenizer.model_max_length)

if data_args.prob_map is None:
# slimpajama samples openllama-3B tokenized
# data_args.prob_map = {
# "cc": 0.67,
# "wikipedia": 0.33,
# }

# redpajama
data_args.prob_map = {
"en_cc": 0.67,
"en_c4": 0.15,
@@ -239,6 +246,7 @@ def main():
ModelClass = MODEL_MAP[model_args.model_type]

# model = LlamaForCausalLM(config)
# model.half()
# model.to(torch_dtype)

model: LlamaForCausalLM | LlamaMoEForCausalLM = ModelClass.from_pretrained(
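
Note: the prob_map hunk above sets per-domain sampling weights for the pretraining mixture (the RedPajama dict is truncated by the diff view, so the remaining entries are not shown here). As a hedged sketch of how such a map is typically consumed, the snippet below draws source domains proportionally to their weights; the weights and the "other" key are placeholders, and random.choices stands in for whatever sampler this codebase actually uses:

import random

prob_map = {"en_cc": 0.67, "en_c4": 0.15, "other": 0.18}  # placeholder weights summing to 1.0
domains, weights = zip(*prob_map.items())

def sample_domain() -> str:
    # Pick the next source domain proportionally to its configured probability.
    return random.choices(domains, weights=weights, k=1)[0]

counts = {d: 0 for d in domains}
for _ in range(10_000):
    counts[sample_domain()] += 1
print(counts)  # counts are roughly proportional to prob_map
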
12 changes: 8 additions & 4 deletions smoe/trainer/llama_lr_scheduling.py
@@ -192,10 +192,12 @@ def _maybe_log_save_evaluate(
4,
)
logs["learning_rate"] = self._get_learning_rate()
logs["num_dropped_tokens"] = num_dropped_tokens
logs["gate_load"] = gate_load
logs["gate_importance"] = gate_importance
logs["balance_loss"] = balance_loss
logs["num_dropped_tokens"] = [x.item() for x in num_dropped_tokens]
logs["gate_load"] = [x.detach().cpu().tolist() for x in gate_load]
logs["gate_importance"] = [
x.detach().cpu().tolist() for x in gate_importance
]
logs["balance_loss"] = balance_loss.item()

self._total_loss_scalar += tr_loss_scalar
self._globalstep_last_logged = self.state.global_step
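
Note: the four MoE statistics are now converted to plain Python numbers and lists before they are placed in logs, which keeps the log payload JSON-serializable and free of CUDA tensor references by the time it reaches the callbacks. A hedged illustration of the same conversion on dummy data (shapes and values below are made up for the example):

import torch

# Pretend per-layer statistics as they might come out of an MoE forward pass.
num_dropped_tokens = (torch.tensor(3.0), torch.tensor(1.0))
gate_load = (torch.rand(16), torch.rand(16))
balance_loss = torch.tensor(0.02)

logs = {}
logs["num_dropped_tokens"] = [x.item() for x in num_dropped_tokens]  # tensors -> floats
logs["gate_load"] = [x.detach().cpu().tolist() for x in gate_load]   # tensors -> nested lists
logs["balance_loss"] = balance_loss.item()                           # 0-dim tensor -> float
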
@@ -376,6 +378,8 @@ def _inner_training_loop(
self._created_lr_scheduler = False

if self.is_deepspeed_enabled:
# # zhutong: move model to cuda device for fused optim init
# self.model.to(self.accelerator.device)
self.optimizer, self.lr_scheduler = deepspeed_init(
self, num_training_steps=max_steps
)
5 changes: 4 additions & 1 deletion smoe/utils/io.py
@@ -45,7 +45,10 @@ def tell(self):

def __iter__(self):
for line in self.fin:
yield json.loads(line)
try:
yield json.loads(line)
except json.JSONDecodeError:
pass
self.fin.close()
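
Note: the iterator now skips lines that fail to parse instead of raising and aborting the whole pass over the file. A minimal standalone sketch of the same pattern, with an illustrative function name that is not part of the repository:

import json

def iter_jsonl(path: str):
    # Yield one parsed object per line, silently dropping malformed records,
    # mirroring the try/except added in the diff above.
    with open(path, "r", encoding="utf8") as fin:
        for line in fin:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue
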


5 changes: 4 additions & 1 deletion smoe/utils/visualization/visualize.py
@@ -191,6 +191,9 @@ def visualize_swiglu_output(


def find_factors_with_minimal_sum(number):
if number == 1:
return (1, 1)

# Initialize variables to keep track of the factors with the minimal sum
min_sum = float("inf")
min_factors = None
@@ -286,7 +289,7 @@ def vis_tuple_heatmaps(tensors: tuple[torch.FloatTensor]):
axes = axes.reshape(*img_grid)
for i in range(data.shape[0]):
ax = axes[i // img_grid[1], i % img_grid[1]]
im = ax.imshow(
ax.imshow(
data[i].cpu().reshape(*shape).float().detach().numpy(),
cmap=cmap,
interpolation="nearest",
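
Note: the first hunk in this file adds an explicit early return for number == 1 in find_factors_with_minimal_sum, and the second drops an unused im = binding around ax.imshow. For context, one plausible implementation of such a helper, shown only as a sketch consistent with the early return added here and not necessarily the repository's exact body, picks the factor pair with the smallest sum so per-expert heatmaps can be laid out on a near-square grid:

import math

def find_factors_with_minimal_sum(number: int) -> tuple[int, int]:
    if number == 1:
        return (1, 1)
    # Track the factor pair (a, b) with a * b == number and minimal a + b.
    min_sum = float("inf")
    min_factors = (1, number)
    for a in range(1, math.isqrt(number) + 1):
        if number % a == 0:
            b = number // a
            if a + b < min_sum:
                min_sum = a + b
                min_factors = (a, b)
    return min_factors
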
