diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index e6eea2183de5..52d7c95ec9d8 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -495,7 +495,8 @@ def get_model_num_kv_heads(self, config): num_kv_heads = None # multi_query_group_num is for chatglm2 & chatglm3 kv_head_names = [ - 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads' + 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads', + 'attention_heads' ] for name in kv_head_names: if hasattr(config, name): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 64dc5479940c..cf70c4530c82 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -274,7 +274,13 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): _autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. Try to get num_key_heads from model_config.num_key_value_heads - num_kv_heads = _autotp.get_model_num_kv_heads(model_config) + if hasattr(model_config, "vision_config"): + if "MllamaVisionEncoderLayer" in str(module): + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) # 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) @@ -339,6 +345,8 @@ def set_lm_head(module): "weight") and not module.embed_out.weight.is_meta and isinstance( module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") + elif hasattr(module.language_model, "lm_head"): + module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head") return module def conv2d_parallel_shard_weights(model, rank, world_size): diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 679230ca7d4c..55cfa8f59c91 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -77,7 +77,7 @@ # Steps STEPS_PER_PRINT = "steps_per_print" -STEPS_PER_PRINT_DEFAULT = 10 +STEPS_PER_PRINT_DEFAULT = None ######################################### # Training micro batch size per GPU diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index b590ea432658..0371c5663a2d 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2145,8 +2145,6 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): else: self.zero_grad() - report_progress = self.global_rank == 0 if self.global_rank else True - # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function. overflow = False if hasattr(self.optimizer, "overflow"): @@ -2166,8 +2164,10 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): # pipe_engine.train_batch() self.lr_scheduler.step(self.train_batch_size()) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + if self.steps_per_print() is not None: + report_progress = self.global_rank == 0 if self.global_rank else True + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) self.losses = None self.global_steps += 1 diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 00f17dea709c..64ae8ac0e5b4 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -198,7 +198,7 @@ def get_mean(self, names, normalizer=1.0, reset=True): class ThroughputTimer: - def __init__(self, config, batch_size, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None): + def __init__(self, config, batch_size, start_step=2, steps_per_output=None, monitor_memory=False, logging_fn=None): from deepspeed.utils import logger self.config = config self.start_time = 0 @@ -238,6 +238,11 @@ def start(self): get_accelerator().synchronize() self.start_time = time.time() + def _is_report_boundary(self): + if self.steps_per_output is None: + return False + return self.global_step_count % self.steps_per_output == 0 + def stop(self, global_step=False, report_speed=True): if not self.config.enabled or not self.started: return @@ -255,7 +260,7 @@ def stop(self, global_step=False, report_speed=True): self.step_elapsed_time += duration if global_step: - if report_speed and self.global_step_count % self.steps_per_output == 0: + if report_speed and self._is_report_boundary(): self.logging( "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " "MemAllocated={}GB, MaxMemAllocated={}GB".format(