From 36bfe87d345c2ad0d3a21b46b212f2687fb59c2d Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Thu, 24 Oct 2024 15:05:12 -0500
Subject: [PATCH] Pretrain tok sec (#1805)

---
Review notes (text between "---" and the diff is not part of the commit):
- Tok/sec previously divided total_tokens by train_time, which the
  "Training Time" line treats as a perf_counter() start stamp rather than
  a duration. Elapsed time is now computed once and used for both lines.
- Hunk line counts and the diffstat are updated for the two added lines;
  the post-image blob id on the pretrain.py "index" line is left as it was.
- Line breaks and indentation were reconstructed from a flattened copy of
  this patch; confirm whitespace against the target tree before applying.

 litgpt/pretrain.py | 19 +++++++++++++++++--
 litgpt/utils.py    |  2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py
index e10df56a5b..739ac2df77 100644
--- a/litgpt/pretrain.py
+++ b/litgpt/pretrain.py
@@ -237,9 +237,24 @@ def main(
     # Save final checkpoint
     save_checkpoint(fabric, state, tokenizer_dir, out_dir / "final" / "lit_model.pth")
 
-    fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s")
+    total_tokens = state["iter_num"] * train.micro_batch_size * model.max_seq_length * fabric.world_size
+    # train_time is a perf_counter() start stamp; convert it to a duration once.
+    elapsed = time.perf_counter() - train_time
+
+    # Print formatted output
+    separator = "-" * 40
+    fabric.print(separator)
+    fabric.print("| Performance")
+    fabric.print(f"| - Total tokens : {total_tokens:,}")
+    fabric.print(f"| - Training Time : {elapsed:.2f} s")
+    fabric.print(f"| - Tok/sec : {total_tokens / elapsed:.2f} tok/s")
+    fabric.print("| " + "-" * 40)
+
     if fabric.device.type == "cuda":
-        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+        memory_used = torch.cuda.max_memory_allocated() / 1e9
+        fabric.print("| Memory Usage")
+        fabric.print(f"| - Memory Used : {memory_used:.2f} GB")
+        fabric.print(separator)
 
 
 def fit(
diff --git a/litgpt/utils.py b/litgpt/utils.py
index 10e3831745..2180762617 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -782,7 +782,7 @@ def create_finetuning_performance_report(training_time, token_counts, device_typ
         memory_used = torch.cuda.max_memory_allocated() / 1e9
         output += f"| Memory Usage \n"
         output += f"| - Memory Used : {memory_used:.02f} GB \n"
-    output += "=======================================================\n"
+    output += "-------------------------------------------------------\n"
 
     return output
 