Skip to content

Commit

Permalink
Merge branch 'master' into rearrange_ops
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Oct 8, 2024
2 parents dee13b9 + f74ea69 commit 89cdf11
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 9 deletions.
3 changes: 2 additions & 1 deletion deepspeed/module_inject/auto_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,8 @@ def get_model_num_kv_heads(self, config):
num_kv_heads = None
# multi_query_group_num is for chatglm2 & chatglm3
kv_head_names = [
'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads'
'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads',
'attention_heads'
]
for name in kv_head_names:
if hasattr(config, name):
Expand Down
10 changes: 9 additions & 1 deletion deepspeed/module_inject/replace_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,13 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
_autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)

# 3. Try to get num_key_heads from model_config.num_key_value_heads
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
if hasattr(model_config, "vision_config"):
if "MllamaVisionEncoderLayer" in str(module):
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)

# 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division
set_num_kv_heads(num_kv_heads)
Expand Down Expand Up @@ -339,6 +345,8 @@ def set_lm_head(module):
"weight") and not module.embed_out.weight.is_meta and isinstance(
module.embed_out, torch.nn.Linear):
module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out")
elif hasattr(module.language_model, "lm_head"):
module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head")
return module

def conv2d_parallel_shard_weights(model, rank, world_size):
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@

# Steps
STEPS_PER_PRINT = "steps_per_print"
STEPS_PER_PRINT_DEFAULT = 10
STEPS_PER_PRINT_DEFAULT = None

#########################################
# Training micro batch size per GPU
Expand Down
8 changes: 4 additions & 4 deletions deepspeed/runtime/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2145,8 +2145,6 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
else:
self.zero_grad()

report_progress = self.global_rank == 0 if self.global_rank else True

# Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function.
overflow = False
if hasattr(self.optimizer, "overflow"):
Expand All @@ -2166,8 +2164,10 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
# pipe_engine.train_batch()
self.lr_scheduler.step(self.train_batch_size())

if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0:
self._report_progress(self.global_steps + 1)
if self.steps_per_print() is not None:
report_progress = self.global_rank == 0 if self.global_rank else True
if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0:
self._report_progress(self.global_steps + 1)

self.losses = None
self.global_steps += 1
Expand Down
9 changes: 7 additions & 2 deletions deepspeed/utils/timer.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def get_mean(self, names, normalizer=1.0, reset=True):

class ThroughputTimer:

def __init__(self, config, batch_size, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None):
def __init__(self, config, batch_size, start_step=2, steps_per_output=None, monitor_memory=False, logging_fn=None):
from deepspeed.utils import logger
self.config = config
self.start_time = 0
Expand Down Expand Up @@ -238,6 +238,11 @@ def start(self):
get_accelerator().synchronize()
self.start_time = time.time()

def _is_report_boundary(self):
if self.steps_per_output is None:
return False
return self.global_step_count % self.steps_per_output == 0

def stop(self, global_step=False, report_speed=True):
if not self.config.enabled or not self.started:
return
Expand All @@ -255,7 +260,7 @@ def stop(self, global_step=False, report_speed=True):
self.step_elapsed_time += duration

if global_step:
if report_speed and self.global_step_count % self.steps_per_output == 0:
if report_speed and self._is_report_boundary():
self.logging(
"epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, "
"MemAllocated={}GB, MaxMemAllocated={}GB".format(
Expand Down

0 comments on commit 89cdf11

Please sign in to comment.