This repository has been archived by the owner on Mar 23, 2023. It is now read-only.

[format] Improve readability by fixing various naming conventions and typos (#128)
ryanrussell authored May 26, 2022
1 parent f27b788 commit 22feb71
Showing 44 changed files with 86 additions and 86 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -72,7 +72,7 @@ The `image` and `language` folders are for complex model applications. The `feat
## Discussion

Discussion about the Colossal-AI project and examples is always welcomed! We would love to exchange ideas with the community to better help this project grow.
- If you think there is a need to discuss anything, you may jump to our [dicussion forum](https://github.com/hpcaitech/ColossalAI/discussions) and create a topic there.
+ If you think there is a need to discuss anything, you may jump to our [discussion forum](https://github.com/hpcaitech/ColossalAI/discussions) and create a topic there.

If you encounter any problem while running these examples, you may want to raise an issue in this repository.

@@ -88,9 +88,9 @@ If you find that an example is broken (not working) or not user-friendly, you ma

If you wish to add an example for a specific application, please follow the steps below.

- 1. create a folder in the `image`, `language` or `features` folders. Generally we do not accept new examples for `features` as one example is often enough. **We encourage contribution with hybrid parallel or models of different domains (e.g. GAN, self-supervised, detection, video understadning, text classification, text generation)**
+ 1. create a folder in the `image`, `language` or `features` folders. Generally we do not accept new examples for `features` as one example is often enough. **We encourage contribution with hybrid parallel or models of different domains (e.g. GAN, self-supervised, detection, video understanding, text classification, text generation)**
2. Prepare configuration files and `train.py`
- 3. Prepare a detailed readme on envirionment setup, dataset preparation, code execution, etc. in your example folder
+ 3. Prepare a detailed readme on environment setup, dataset preparation, code execution, etc. in your example folder
4. Update the table of content (first section above) in this readme file


6 changes: 3 additions & 3 deletions benchmark/zero/common/train.py
@@ -15,7 +15,7 @@ def _train(epoch, rank, world_size, train_dataloader, model, criterion, optimize
use_autocast = CONFIG['method'] in ['torch', 'colossalai'] and \
'fp16' in CONFIG and CONFIG['fp16'].get('enabled', True)
clip_grad_norm = CONFIG.get('gradient_clipping', 0.)
- use_integraded_clip_grad = CONFIG['method'] in ['fairscale']
+ use_integrated_clip_grad = CONFIG['method'] in ['fairscale']
use_colossalai_zero_v1 = CONFIG['method'] == 'colossalai' and CONFIG.get('sharded_model_version', 2) == 1

model.train()
@@ -103,7 +103,7 @@ def _train(epoch, rank, world_size, train_dataloader, model, criterion, optimize
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
if clip_grad_norm > 0:
- if use_integraded_clip_grad: # fairscale style
+ if use_integrated_clip_grad: # fairscale style
model.clip_grad_norm_(clip_grad_norm)
else: # torch style
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
@@ -114,7 +114,7 @@ def _train(epoch, rank, world_size, train_dataloader, model, criterion, optimize
else: # torch & fairscale normal style
loss.backward()
if clip_grad_norm > 0:
- if use_integraded_clip_grad: # fairscale style
+ if use_integrated_clip_grad: # fairscale style
model.clip_grad_norm_(clip_grad_norm)
else: # torch style
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
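For readers skimming this hunk, here is a minimal, self-contained sketch of the plain-PyTorch branch of the pattern above (AMP scaling, unscale before clipping, then step); the fairscale branch would instead call `model.clip_grad_norm_(clip_grad_norm)` on the sharded model. The model, data, and threshold below are illustrative and not taken from the benchmark config.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"

model = torch.nn.Linear(16, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
clip_grad_norm = 1.0  # illustrative threshold

x = torch.randn(8, 16, device=device)
target = torch.randint(0, 4, (8,), device=device)

with torch.cuda.amp.autocast(enabled=use_amp):
    loss = torch.nn.functional.cross_entropy(model(x), target)

scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # gradients must be unscaled before clipping
if clip_grad_norm > 0:
    # torch style; a fairscale sharded model would use model.clip_grad_norm_ instead
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
```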
2 changes: 1 addition & 1 deletion benchmark/zero/common/vit.py
@@ -144,7 +144,7 @@ def __init__(self,
interp_type=types.INTERP_TRIANGULAR)
flip_lr = False

- # center crop and normalise
+ # center crop and normalize
images = fn.crop_mirror_normalize(images,
dtype=types.FLOAT,
crop=(crop, crop),
2 changes: 1 addition & 1 deletion features/amp/README.md
@@ -6,7 +6,7 @@ You may refer to [our documentation on mixed precision training](https://colossa

> ⚠️ This example is only for demo purpose, no guarantee on the convergence performance
- # Prerequiste
+ # Prerequisite

```shell
pip install timm scipy
2 changes: 1 addition & 1 deletion features/amp/train.py
@@ -61,7 +61,7 @@ def main():
# build loss
criterion = torch.nn.CrossEntropyLoss()

- # lr_scheduelr
+ # lr_scheduler
lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=1, total_steps=gpc.config.NUM_EPOCHS)

engine, train_dataloader, _, _ = colossalai.initialize(
2 changes: 1 addition & 1 deletion features/gradient_clipping/README.md
@@ -19,7 +19,7 @@ export DATA=/path/to/data

## Verify Gradient Clipping

- To verify gradinet clipping, we can just check the change of parameter values.
+ To verify gradient clipping, we can just check the change of parameter values.

```bash
colossalai run --nproc_per_node 1 train.py
2 changes: 1 addition & 1 deletion features/tensor_parallel/README.md
@@ -5,7 +5,7 @@
To use tensor parallelism, there are several steps to follow:

1. define `parallel` in your configuration file. Set `mode` for `tensor` to `1d`, `2d`, `2.5d` or `3d`.
- 2. cosntruct your model, replace `torch.nn.Linear` with `colossalai.nn.Linear`.
+ 2. construct your model, replace `torch.nn.Linear` with `colossalai.nn.Linear`.
3. split the input data accordingly

## Reference
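As an aside on the steps listed in this excerpt, a minimal sketch of what they amount to might look like the following (illustrative only: 4 GPUs in a 2D mesh, the generic legacy ColossalAI `parallel` config format, and a toy model; none of it is taken from the example's own files).

```python
# config.py (illustrative)
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=4, mode='2d'),  # or '1d', '2.5d', '3d'
)
```

```python
# model definition: torch.nn.Linear swapped for colossalai.nn.Linear
import torch
import colossalai.nn as col_nn

class ToyMLP(torch.nn.Module):
    def __init__(self, dim=256, hidden=1024):
        super().__init__()
        self.dense_1 = col_nn.Linear(dim, hidden)
        self.dense_2 = col_nn.Linear(hidden, dim)

    def forward(self, x):
        return self.dense_2(torch.relu(self.dense_1(x)))
```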
2 changes: 1 addition & 1 deletion image/detr/README.md
@@ -4,7 +4,7 @@ Reproduce the DETR model with ColossalAI
## Background
This project is the reproduction of [DETR model](https://arxiv.org/abs/2005.12872) with [ColossalAI](https://github.com/hpcaitech/ColossalAI) tool.

- ## Envirionment setup
+ ## Environment setup
```
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
2 changes: 1 addition & 1 deletion image/detr/models/matcher.py
@@ -65,7 +65,7 @@ def forward(self, outputs, targets):

# Compute the classification cost. Contrary to the loss, we don't use the NLL,
# but approximate it in 1 - proba[target class].
- # The 1 is a constant that doesn't change the matching, it can be ommitted.
+ # The 1 is a constant that doesn't change the matching, it can be omitted.
cost_class = -out_prob[:, tgt_ids]

# Compute the L1 cost between boxes
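The reasoning in the comment above can be spelled out with a short calculation: for any one-to-one assignment $\sigma$ of the $N$ queries to targets,

```latex
\sum_{i=1}^{N}\bigl(1 - p_i(c_{\sigma(i)})\bigr) \;=\; N - \sum_{i=1}^{N} p_i(c_{\sigma(i)}),
```

so the constant $N$ does not affect which assignment minimizes the total cost, and `cost_class = -out_prob[:, tgt_ids]` yields the same Hungarian matching as `1 - out_prob[:, tgt_ids]` would.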
2 changes: 1 addition & 1 deletion image/detr/util/misc.py
@@ -257,7 +257,7 @@ def log_every(self, iterable, print_freq, header=None):
# sha = _run(['git', 'rev-parse', 'HEAD'])
# subprocess.check_output(['git', 'diff'], cwd=cwd)
# diff = _run(['git', 'diff-index', 'HEAD'])
- # diff = "has uncommited changes" if diff else "clean"
+ # diff = "has uncommitted changes" if diff else "clean"
# branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
# except Exception:
# pass
4 changes: 2 additions & 2 deletions image/moe/README.md
@@ -7,12 +7,12 @@ It is designed to improve the performance of our models without any additional t
version moe parallelism will cause a moderate computation overhead and additional memory usage. But
we are happy to announce that recently enabled CUDA kernels have solved the problem above. There
are only two things that you need to concern. One is the additional communication time which highly
- depends on the topology and bandwith of the network in running environment. Another is extra memory usgae,
+ depends on the topology and bandwidth of the network in running environment. Another is extra memory usage,
since we have a larger model thanks to MoE. We will continuously maintain and optimize our MoE system
and be encouraged by any issue that can help us improve our system.

At present, we have provided Widenet and ViT-MoE in our model zoo (more information about Widenet can be
- found [here](https://arxiv.org/abs/2107.11817)). We now support a recent tecnique proposed by Microsoft, PR-MoE.
+ found [here](https://arxiv.org/abs/2107.11817)). We now support a recent technique proposed by Microsoft, PR-MoE.
You can access [here](https://arxiv.org/abs/2201.05596) to know more about PR-MoE.
Directly use ViT-MoE in our model zoo or use MoeModule in your model to exploit PR-MoE.

2 changes: 1 addition & 1 deletion image/simclr/train_linear.py
@@ -59,7 +59,7 @@ def main():
# build optimizer
optimizer = colossalai.nn.FusedSGD(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY, momentum=gpc.config.MOMENTUM)

- # lr_scheduelr
+ # lr_scheduler
lr_scheduler = CosineAnnealingWarmupLR(optimizer, warmup_steps=5, total_steps=gpc.config.NUM_EPOCHS)

engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
2 changes: 1 addition & 1 deletion image/simclr/train_simclr.py
@@ -64,7 +64,7 @@ def main():
optimizer = colossalai.nn.FusedSGD(model.parameters(), lr=gpc.config.LEARNING_RATE,
weight_decay=gpc.config.WEIGHT_DECAY, momentum=gpc.config.MOMENTUM)

- # lr_scheduelr
+ # lr_scheduler
lr_scheduler = CosineAnnealingWarmupLR(optimizer, warmup_steps=10, total_steps=gpc.config.NUM_EPOCHS)

engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
2 changes: 1 addition & 1 deletion image/vilt/utils/objectives.py
@@ -7,7 +7,7 @@


def cost_matrix_cosine(x, y, eps=1e-5):
- """Compute cosine distnace across every pairs of x, y (batched)
+ """Compute cosine distance across every pairs of x, y (batched)
[B, L_x, D] [B, L_y, D] -> [B, Lx, Ly]"""
assert x.dim() == y.dim()
assert x.size(0) == y.size(0)
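As a side note, the docstring above describes a standard batched cosine-distance matrix. A minimal stand-alone sketch of the same idea (an illustration, not the ViLT implementation itself) is:

```python
import torch

def cosine_distance_matrix(x, y, eps=1e-5):
    """x: [B, Lx, D], y: [B, Ly, D] -> [B, Lx, Ly] of (1 - cosine similarity)."""
    x = x / x.norm(dim=-1, keepdim=True).clamp(min=eps)
    y = y / y.norm(dim=-1, keepdim=True).clamp(min=eps)
    return 1.0 - torch.bmm(x, y.transpose(1, 2))

dist = cosine_distance_matrix(torch.randn(2, 5, 8), torch.randn(2, 7, 8))
print(dist.shape)  # torch.Size([2, 5, 7])
```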
4 changes: 2 additions & 2 deletions image/vilt/utils/transforms/randaug.py
@@ -1,4 +1,4 @@
- # code in this file is adpated from rpmcruz/autoaugment
+ # code in this file is adapted from rpmcruz/autoaugment
# https://github.com/rpmcruz/autoaugment/blob/master/transformations.py
import random

@@ -158,7 +158,7 @@ def Identity(img, v):
return img


- def augment_list(): # 16 oeprations and their ranges
+ def augment_list(): # 16 operations and their ranges
# https://github.com/google-research/uda/blob/master/image/randaugment/policies.py#L57
# l = [
# (Identity, 0., 1.0),
2 changes: 1 addition & 1 deletion image/vision_transformer/data_parallel/README.md
@@ -48,7 +48,7 @@ As can be seen from the above figure, the ViT model eventually converges well af
# Details
`config.py`

- This is a [configuration file](https://colossalai.org/config.html) that defines hyperparameters and trainign scheme (fp16, gradient accumulation, etc.). The config content can be accessed through `gpc.config` in the program.
+ This is a [configuration file](https://colossalai.org/config.html) that defines hyperparameters and training scheme (fp16, gradient accumulation, etc.). The config content can be accessed through `gpc.config` in the program.

In this example, we trained ViT-Base/16 for 300 epochs on the ImageNet-1K dataset. The batch size is expanded to 32K through data parallelism. Since only 4 A100 GPUs on one small server are used, and the GPU memory is limited, the batch size of 32K cannot be used directly. Therefore, the batch size used on each GPU is only 256, and the 256 batch size is equivalently expanded to 8K through gradient accumulation 32 times. Finally, data parallelism is used between 4 GPUs to achieve an equivalent batch size of 32K.

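The batch-size bookkeeping in the paragraph above checks out as follows (simple arithmetic using the numbers quoted in the README excerpt):

```python
per_gpu_batch = 256   # batch size that fits on one GPU in this setup
accum_steps = 32      # gradient accumulation steps
num_gpus = 4          # data-parallel workers

per_gpu_effective = per_gpu_batch * accum_steps   # 8192, i.e. ~8K
global_effective = per_gpu_effective * num_gpus   # 32768, i.e. ~32K
print(per_gpu_effective, global_effective)
```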
2 changes: 1 addition & 1 deletion image/vision_transformer/data_parallel/train.py
@@ -45,7 +45,7 @@ def main():
# build loss
criterion = MixupLoss(loss_fn_cls=torch.nn.CrossEntropyLoss)

- # lr_scheduelr
+ # lr_scheduler
lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=1, total_steps=gpc.config.NUM_EPOCHS)

engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader,
@@ -40,7 +40,7 @@ def main():
# build loss
criterion = torch.nn.CrossEntropyLoss()

- # lr_scheduelr
+ # lr_scheduler
lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)

engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader,
@@ -68,7 +68,7 @@ def main():
pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")

- # craete dataloaders
+ # create dataloaders
root = os.environ.get('DATA', './data')
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)

@@ -83,7 +83,7 @@ def main():
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)

- # intiailize
+ # initialize
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
4 changes: 2 additions & 2 deletions image/vision_transformer/hybrid_parallel/train_with_engine.py
@@ -69,7 +69,7 @@ def train_imagenet():
pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")

- # craete dataloaders
+ # create dataloaders
root = os.environ['DATA']
train_dataloader, test_dataloader = build_dali_imagenet(root, rand_augment=False)

@@ -84,7 +84,7 @@ def train_imagenet():
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)

- # intiailize
+ # initialize
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
@@ -79,7 +79,7 @@ def train_imagenet():
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)

- # intiailize
+ # initialize
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
@@ -1,5 +1,5 @@
"""
- Adapted from huggingface modeling_bert.py. Change the necessary part to use Colossolai.
+ Adapted from huggingface modeling_bert.py. Change the necessary part to use Colossalai.
"""
import torch
import math
@@ -644,7 +644,7 @@ def forward(
else:
encoder_extended_attention_mask = None

- ###print("BertModel:emeddings:", embedding_output.shape)
+ ###print("BertModel:embeddings:", embedding_output.shape)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
2 changes: 1 addition & 1 deletion language/bert/hybrid_parallel/common/train.py
@@ -15,7 +15,7 @@ def _train(epoch, rank, world_size, train_dataloader, model, criterion, optimize
use_autocast = CONFIG['method'] in ['torch', 'colossalai'] and \
'fp16' in CONFIG and CONFIG['fp16'].get('enabled', True)
clip_grad_norm = CONFIG.get('gradient_clipping', 0.)
- use_integraded_clip_grad = CONFIG['method'] in ['fairscale']
+ use_integrated_clip_grad = CONFIG['method'] in ['fairscale']
use_colossalai_zero_v1 = CONFIG['method'] == 'colossalai' and CONFIG.get('sharded_model_version', 2) == 1

model.train()
2 changes: 1 addition & 1 deletion language/bert/sequene_parallel/README.md
@@ -124,7 +124,7 @@ make

In the `config.py` provided, a set of parameters are defined including training scheme, model, etc.
You can also modify the ColossalAI setting. For example, if you wish to parallelize over the
- sequence diemsnion on 8 GPUs. You can change `size=4` to `size=8`. If you wish to use pipeline parallelism, you can set `pipeline=<num_of_pipeline_stages>`.
+ sequence dimension on 8 GPUs. You can change `size=4` to `size=8`. If you wish to use pipeline parallelism, you can set `pipeline=<num_of_pipeline_stages>`.

### Step 4. Invoke parallel training

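For readers of this excerpt, a rough sketch of the `parallel` section such a `config.py` might contain is shown below; the field names follow the generic ColossalAI config convention and are an assumption here, since the actual file in the example may be laid out differently.

```python
# illustrative only; adjust to the real config.py shipped with the example
parallel = dict(
    pipeline=1,                            # set to <num_of_pipeline_stages> to enable pipeline parallelism
    tensor=dict(size=8, mode='sequence'),  # was size=4; 8 GPUs along the sequence dimension
)
```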
4 changes: 2 additions & 2 deletions language/bert/sequene_parallel/data/__init__.py
@@ -28,7 +28,7 @@ def build_train_valid_test_data_iterators(train_iters,
# Backward compatibility, assume fixed batch size.
# if iteration > 0 and consumed_train_samples == 0:
# assert train_samples is None, \
- # 'only backward compatiblity support for iteration-based training'
+ # 'only backward compatibility support for iteration-based training'
# consumed_train_samples = iteration * global_batch_size
# if iteration > 0 and consumed_valid_samples == 0:
# if train_samples is None:
@@ -54,7 +54,7 @@ def build_train_valid_test_data_iterators(train_iters,
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
train_valid_test_num_samples=train_val_test_num_samples, **kwargs)

- # Build dataloders.
+ # Build dataloaders.
dp_size = gpc.get_world_size(ParallelMode.DATA)
train_dataloader = build_pretraining_data_loader(
train_ds, consumed_samples=0, micro_batch_size=global_batch_size//dp_size)
2 changes: 1 addition & 1 deletion language/bert/sequene_parallel/data/bert_helper.py
@@ -53,7 +53,7 @@ def broadcast_data(keys, data, datatype):
members of the same model parallel group.
Arguments:
- keys: list of keys in the data disctionary to be broadcasted
+ keys: list of keys in the data dictionary to be broadcasted
data: data dictionary of string keys and cpu tensor values.
datatype: torch data type of all tensors in data associated
with keys.
10 changes: 5 additions & 5 deletions language/bert/sequene_parallel/data/datasets/bert_dataset.py
@@ -73,7 +73,7 @@ def __getitem__(self, idx):
sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
# Note that this rng state should be numpy and not python since
# python randint is inclusive whereas the numpy one is exclusive.
- # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1
+ # We % 2**32 since numpy requires the seed to be between 0 and 2**32 - 1
np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32))
return build_training_sample(
sample,
@@ -126,16 +126,16 @@ def get_samples_mapping_(indexed_dataset, data_prefix, num_epochs, max_num_sampl
# Build samples mapping
verbose = torch.distributed.get_rank() == 0
start_time = time.time()
- logger.info('\n > building sapmles index mapping for {} ...'.format(name), ranks=[0])
+ logger.info('\n > building samples index mapping for {} ...'.format(name), ranks=[0])
# First compile and then import.
samples_mapping = helpers.build_mapping(indexed_dataset.doc_idx, indexed_dataset.sizes, num_epochs,
max_num_samples, max_seq_length, short_seq_prob, seed, verbose,
2 if binary_head else 1)
- logger.info('\n > done building sapmles index maping', ranks=[0])
+ logger.info('\n > done building samples index maping', ranks=[0])
np.save(indexmap_filename, samples_mapping, allow_pickle=True)
logger.info('\n > saved the index mapping in {}'.format(indexmap_filename), ranks=[0])
# Make sure all the ranks have built the mapping
- logger.info('\n > elasped time to build and save samples mapping '
+ logger.info('\n > elapsed time to build and save samples mapping '
'(seconds): {:4f}'.format(time.time() - start_time),
ranks=[0])
# This should be a barrier but nccl barrier assumes
@@ -161,7 +161,7 @@ def get_samples_mapping_(indexed_dataset, data_prefix, num_epochs, max_num_sampl

def build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, cls_id,
sep_id, mask_id, pad_id, masked_lm_prob, np_rng, binary_head):
- """Biuld training sample.
+ """Build training sample.
Arguments:
sample: A list of sentences in which each sentence is a list token ids.
@@ -39,7 +39,7 @@ def __init__(self, datasets, weights):
assert sum_weights > 0.0
weights /= sum_weights

- # Build indecies.
+ # Build indices.
start_time = time.time()
assert num_datasets < 255
self.dataset_index = np.zeros(self.size, dtype=np.uint8)
2 changes: 1 addition & 1 deletion language/bert/sequene_parallel/data/datasets/builder.py
@@ -26,7 +26,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
skip_warmup)

# Get start and end indices of train/valid/train into doc-idx
- # Note that doc-idx is desinged to be num-docs + 1 so we can
+ # Note that doc-idx is designed to be num-docs + 1 so we can
# easily iterate over it.
total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
splits = get_train_valid_test_split_(splits_string, total_num_of_documents)