diff --git a/image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py b/image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py
index 23a4501..2450ab1 100644
--- a/image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py
+++ b/image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py
@@ -34,4 +34,3 @@
 
 # pipeline config
 NUM_MICRO_BATCHES = parallel['pipeline']
-TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE)
diff --git a/image/vision_transformer/hybrid_parallel/train_with_engine.py b/image/vision_transformer/hybrid_parallel/train_with_engine.py
index a0eaf0a..1dbce22 100644
--- a/image/vision_transformer/hybrid_parallel/train_with_engine.py
+++ b/image/vision_transformer/hybrid_parallel/train_with_engine.py
@@ -71,8 +71,8 @@ def train_imagenet():
 
     # create dataloaders
    root = os.environ['DATA']
-    train_dataloader, test_dataloader = build_dali_imagenet(root, rand_augment=False)
-
+    train_dataloader, test_dataloader = build_dali_imagenet(root, train_batch_size=gpc.config.BATCH_SIZE, \
+        test_batch_size=gpc.config.BATCH_SIZE)
     # create loss function
     criterion = CrossEntropyLoss(label_smoothing=0.1)
 
@@ -92,7 +92,6 @@ def train_imagenet():
         test_dataloader=test_dataloader)
     logger.info("Engine is built", ranks=[0])
 
-
     # create schedule
     schedule = None
     tensor_shape = getattr(gpc.config, 'TENSOR_SHAPE', None)
diff --git a/image/vision_transformer/hybrid_parallel/train_with_trainer.py b/image/vision_transformer/hybrid_parallel/train_with_trainer.py
index f5edeaa..a152abc 100644
--- a/image/vision_transformer/hybrid_parallel/train_with_trainer.py
+++ b/image/vision_transformer/hybrid_parallel/train_with_trainer.py
@@ -66,7 +66,8 @@ def train_imagenet():
 
     # create dataloader
     root = os.environ['DATA']
-    train_dataloader, test_dataloader = build_dali_imagenet(root, rand_augment=False)
+    train_dataloader, test_dataloader = build_dali_imagenet(root, train_batch_size=gpc.config.BATCH_SIZE, \
+        test_batch_size=gpc.config.BATCH_SIZE)
 
     # create loss function
     criterion = CrossEntropyLoss(label_smoothing=0.1)
diff --git a/language/gpt/gpt2_configs/gpt2_pp1d.py b/language/gpt/gpt2_configs/gpt2_pp1d.py
index c5869f8..ad97b09 100644
--- a/language/gpt/gpt2_configs/gpt2_pp1d.py
+++ b/language/gpt/gpt2_configs/gpt2_pp1d.py
@@ -5,7 +5,6 @@
 from colossalai.amp import AMP_TYPE
 import torch
 
-
 BATCH_SIZE = 8
 NUM_EPOCHS = 60
 SEQ_LEN = 1024
@@ -14,17 +13,11 @@
 HIDDEN_SIZE = 768
 PIPELINE = 2
 TENSOR_PARALLEL = 2
-MODE = '1d' 
-TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
+MODE = '1d'
 
-fp16 = dict(
-    mode=AMP_TYPE.NAIVE
-)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
 
-parallel = dict(
-    pipeline=PIPELINE,
-    tensor=dict(mode=MODE, size=TENSOR_PARALLEL)
-)
+parallel = dict(pipeline=PIPELINE, tensor=dict(mode=MODE, size=TENSOR_PARALLEL))
 
 optimizer = dict(
     type=Adam,
diff --git a/language/gpt/train_gpt.py b/language/gpt/train_gpt.py
index 3c80708..3bc47e2 100644
--- a/language/gpt/train_gpt.py
+++ b/language/gpt/train_gpt.py
@@ -141,8 +141,7 @@ def mask_function(attention_mask=None):
                 test_interval=1,
                 hooks=hook_list,
                 display_progress=True,
-                return_output_label=False,
-                max_steps=5)
+                return_output_label=False)
 
 
 if __name__ == '__main__':