
Add CloudFormation templates for FSx and EC2. Move Dockerfiles up one level. Better tqdm logging, disabled on SageMaker. Move argparse lists to separate file. (#15)

* Optimizer checkpointing.

* TqdmHandler for logger (see the sketch after this list).

* Move docker/ out of albert/. Move argparse to its own function at the top. Disable tqdm if on SageMaker. Add --log_frequency argument.

* Abstract argparse into an arguments.py file.

* Move SM parameters into arguments.py.

* Add CloudFormation templates.

* Add CloudFormation templates, for real.

* Mild name fixes.
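
The `TqdmHandler` referenced above is not shown in this commit's visible diff. A minimal sketch of the pattern, assuming only the name from the bullet and the standard `logging` and `tqdm` APIs; the `SM_TRAINING_ENV` check is an assumption, not taken from this commit:

```python
import logging
import os

from tqdm import tqdm


class TqdmHandler(logging.Handler):
    """Logging handler that writes records via tqdm.write() so log
    lines appear above an active progress bar instead of breaking it."""

    def emit(self, record: logging.LogRecord) -> None:
        try:
            tqdm.write(self.format(record))
        except Exception:
            self.handleError(record)


# SageMaker training containers set SM_* environment variables (assumed here),
# so the progress bar can be disabled there, as the commit title describes.
disable_tqdm = "SM_TRAINING_ENV" in os.environ
```

Attaching this handler in place of a plain `StreamHandler` keeps the progress bar on its own line while log records scroll above it.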
jarednielsen authored May 20, 2020
1 parent f1d1bae commit baae6c0
Showing 15 changed files with 497 additions and 381 deletions.
4 changes: 2 additions & 2 deletions models/nlp/albert/README.md
@@ -15,9 +15,9 @@ Language models help AWS customers to improve search results, text classification…

1. Create an FSx volume.

-2. Download the datasets onto FSx. You will need English Wikipedia and BookCorpus, and helper scripts for downloading will be forthcoming.
+2. Download the datasets onto FSx. The simplest way to start is with English Wikipedia.

-3. Create an Elastic Container Registry repository. Then build a Docker image from `docker/ngc_sagemaker.Dockerfile` and push it to ECR.
+3. Create an Amazon Elastic Container Registry (ECR) repository. Then build a Docker image from `docker/ngc_sagemaker.Dockerfile` and push it to ECR.

```bash
export IMAGE=${ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/${REPO}:ngc_tf21_sagemaker
# …
```
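
Step 3's repository can also be created programmatically. A hedged sketch using `boto3`; the repository name and region below are placeholders, not values from this commit:

```python
import boto3

REGION = "us-east-1"  # matches the registry URL in the README snippet
REPO = "albert"       # hypothetical repository name

ecr = boto3.client("ecr", region_name=REGION)
try:
    ecr.create_repository(repositoryName=REPO)
except ecr.exceptions.RepositoryAlreadyExistsException:
    pass  # idempotent: reuse the existing repository

account_id = boto3.client("sts", region_name=REGION).get_caller_identity()["Account"]
print(f"{account_id}.dkr.ecr.{REGION}.amazonaws.com/{REPO}:ngc_tf21_sagemaker")
```

The printed URI has the same shape as the `IMAGE` variable exported above; building and pushing the image against it is still done with the Docker CLI.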
123 changes: 123 additions & 0 deletions models/nlp/albert/arguments.py
@@ -0,0 +1,123 @@
""" Since arguments are duplicated in run_pretraining.py and sagemaker_pretraining.py, they have
been abstracted into this file. It also makes the training scripts much shorter.
"""

import argparse
import os


def populate_pretraining_parser(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("--model_dir", help="Unused, but passed by SageMaker")
    parser.add_argument("--model_type", default="albert", choices=["albert", "bert"])
    parser.add_argument("--model_size", default="base", choices=["base", "large"])
    parser.add_argument("--batch_size", type=int, default=32, help="per GPU")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
    parser.add_argument("--max_seq_length", type=int, default=512, choices=[128, 512])
    parser.add_argument("--warmup_steps", type=int, default=3125)
    parser.add_argument("--total_steps", type=int, default=125000)
    parser.add_argument("--learning_rate", type=float, default=0.00176)
    parser.add_argument("--end_learning_rate", type=float, default=3e-5)
    parser.add_argument("--learning_rate_decay_power", type=float, default=1.0)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.0)
    parser.add_argument("--max_grad_norm", type=float, default=1.0)
    parser.add_argument("--optimizer", default="lamb", choices=["lamb", "adam"])
    parser.add_argument("--name", default="", help="Additional info to append to metadata")
    parser.add_argument("--log_frequency", type=int, default=1000)
    parser.add_argument(
        "--load_from", default="scratch", choices=["scratch", "checkpoint", "huggingface"],
    )
    parser.add_argument("--checkpoint_path", default=None)
    parser.add_argument(
        "--fsx_prefix",
        default="/fsx",
        choices=["/fsx", "/opt/ml/input/data/training"],
        help="Change to /opt/ml/input/data/training on SageMaker",
    )
    # SageMaker does not work with 'store_const' args, since it parses into a dictionary
    # We will treat any value not equal to None as True, and use --skip_xla=true
    parser.add_argument(
        "--skip_xla",
        choices=["true"],
        help="For debugging. Faster startup time, slower runtime, more GPU vRAM.",
    )
    parser.add_argument(
        "--eager",
        choices=["true"],
        help="For debugging. Faster launch, slower runtime, more GPU vRAM.",
    )
    parser.add_argument(
        "--skip_sop", choices=["true"], help="Only use MLM loss, and exclude the SOP loss.",
    )
    parser.add_argument(
        "--skip_mlm", choices=["true"], help="Only use SOP loss, and exclude the MLM loss.",
    )
    parser.add_argument(
        "--pre_layer_norm",
        choices=["true"],
        help="Place layer normalization before the attention & FFN, rather than after adding the residual connection. https://openreview.net/pdf?id=B1x8anVFPr",
    )
    parser.add_argument("--extra_squad_steps", type=str)
    parser.add_argument("--fast_squad", choices=["true"])
    parser.add_argument("--dummy_eval", choices=["true"])
    parser.add_argument("--seed", type=int, default=42)


def populate_squad_parser(parser: argparse.ArgumentParser) -> None:
    # Model loading
    parser.add_argument("--model_type", default="albert", choices=["albert", "bert"])
    parser.add_argument("--model_size", default="base", choices=["base", "large"])
    parser.add_argument("--load_from", required=True)
    parser.add_argument("--load_step", type=int)
    parser.add_argument("--skip_xla", choices=["true"])
    parser.add_argument("--eager", choices=["true"])
    parser.add_argument(
        "--pre_layer_norm",
        choices=["true"],
        help="See https://github.com/huggingface/transformers/pull/3929",
    )
    parser.add_argument(
        "--fsx_prefix",
        default="/fsx",
        choices=["/fsx", "/opt/ml/input/data/training"],
        help="Change to /opt/ml/input/data/training on SageMaker",
    )
    # Hyperparameters from https://arxiv.org/pdf/1909.11942.pdf#page=17
    parser.add_argument("--batch_size", default=6, type=int)
    parser.add_argument("--total_steps", default=8144, type=int)
    parser.add_argument("--warmup_steps", default=814, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--dataset", default="squadv2")
    parser.add_argument("--seed", type=int, default=42)
    # Logging information
    parser.add_argument("--name", default="default")
    parser.add_argument("--validate_frequency", default=1000, type=int)
    parser.add_argument("--checkpoint_frequency", default=500, type=int)
    parser.add_argument("--model_dir", help="Unused, but passed by SageMaker")


def populate_sagemaker_parser(parser: argparse.ArgumentParser) -> None:
    # SageMaker parameters
    parser.add_argument(
        "--source_dir",
        help="For example, /Users/myusername/Desktop/deep-learning-models/models/nlp/albert",
    )
    parser.add_argument("--entry_point", default="run_pretraining.py")
    parser.add_argument("--role", default=os.environ["SAGEMAKER_ROLE"])
    parser.add_argument("--image_name", default=os.environ["SAGEMAKER_IMAGE_NAME"])
    parser.add_argument("--fsx_id", default=os.environ["SAGEMAKER_FSX_ID"])
    parser.add_argument(
        "--subnet_ids", help="Comma-separated string", default=os.environ["SAGEMAKER_SUBNET_IDS"]
    )
    parser.add_argument(
        "--security_group_ids",
        help="Comma-separated string",
        default=os.environ["SAGEMAKER_SECURITY_GROUP_IDS"],
    )
    # Instance specs
    parser.add_argument(
        "--instance_type",
        type=str,
        default="ml.p3dn.24xlarge",
        choices=["ml.p3dn.24xlarge", "ml.p3.16xlarge", "ml.g4dn.12xlarge"],
    )
    parser.add_argument("--instance_count", type=int, default=1)
4 changes: 0 additions & 4 deletions models/nlp/albert/models.py
@@ -8,10 +8,6 @@
)


-def get_initializer(stddev):
-    return tf.keras.initializers.TruncatedNormal(stddev=stddev)
-
-
def load_qa_from_pretrained(
    model: Optional[tf.keras.Model] = None,
    name: Optional[str] = None,