Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add draft gpu troubles #290

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions 1.architectures/1.vpc_network/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Deploy the single-AZ VPC CloudFormation stack used by the ML cluster.
# FIX: added a space before the first line-continuation backslash — the
# original "vpc-stack-ml\" glued the backslash to the stack name, which is
# fragile (arguments fuse if a continuation line loses its leading spaces).
aws cloudformation create-stack --stack-name vpc-stack-ml \
    --template-body file://2.vpc-one-az.yaml \
    --parameters ParameterKey=SubnetsAZ,ParameterValue=us-west-2a \
                 ParameterKey=VPCName,ParameterValue="ML HPC VPC" \
    --capabilities CAPABILITY_IAM

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash

# Create a security group in the given VPC and open HTTP/HTTPS to the world
# (used to expose the Grafana monitoring dashboard).
set -euo pipefail

read -p "Please enter the vpc id of your cluster: " vpc_id
echo -e "creating a security group with $vpc_id..."
# --output text returns just the new group id (no JSON quoting).
security_group=$(aws ec2 create-security-group --group-name grafana-sg --description "Open HTTP/HTTPS ports" --vpc-id "${vpc_id}" --output text)
# Allow inbound HTTPS (443) and HTTP (80) from anywhere.
aws ec2 authorize-security-group-ingress --group-id "${security_group}" --protocol tcp --port 443 --cidr 0.0.0.0/0
# FIX: the original second rule used a Unicode em-dash ("—-cidr"), which the
# AWS CLI rejects as an unknown argument; it must be two ASCII hyphens.
aws ec2 authorize-security-group-ingress --group-id "${security_group}" --protocol tcp --port 80 --cidr 0.0.0.0/0
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# AWS ParallelCluster 3 configuration: Slurm cluster with a c6i.8xlarge head
# node and an on-demand g4dn.12xlarge GPU queue, FSx for Lustre + shared EBS
# storage, and the aws-parallelcluster-monitoring (Grafana) post-install hook.
# NOTE(review): indentation was reconstructed to the ParallelCluster 3 schema;
# the pasted source had lost all nesting. Subnet/SG/AMI ids are
# environment-specific — confirm before deploying.
Imds:
  ImdsSupport: v2.0
Image:
  Os: alinux2
  CustomAmi: ami-053d893ccc907c49c
Tags:
  - Key: 'Grafana'
    Value: 'true'
HeadNode:
  InstanceType: c6i.8xlarge
  Networking:
    SubnetId: subnet-08cdcb1f4d6abc7f3
    AdditionalSecurityGroups:
      - sg-0bbb389be5f1e6563
  Ssh:
    KeyName: pcluster-key
  LocalStorage:
    RootVolume:
      Size: 100
      DeleteOnTermination: true  # that's your root and /home volume for users
  CustomActions:
    OnNodeConfigured:
      # Installs the Prometheus/Grafana monitoring stack on the head node.
      Script: https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-monitoring/main/post-install.sh
      Args:
        - v0.9
  Iam:
    AdditionalIamPolicies:  # grant ECR, SSM and S3 read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
      - Policy: arn:aws:iam::aws:policy/CloudWatchFullAccess
      - Policy: arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess
      - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess
      - Policy: arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - subnet-04226fa682376b4f6
        PlacementGroup:
          Enabled: true  # cluster placement group for low-latency EFA traffic
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /local_scratch  # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      CustomActions:
        OnNodeConfigured:
          Script: https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-monitoring/main/post-install.sh
          Args:
            - v0.9
      Iam:
        AdditionalIamPolicies:
          - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
          - Policy: arn:aws:iam::aws:policy/CloudWatchFullAccess
          - Policy: arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess
          - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess
          - Policy: arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
      # The capacity reservation section is recommended if you use instances
      # with a targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      #CapacityReservationTarget:
      #  CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
      ComputeResources:
        - Name: distributed-ml
          InstanceType: g4dn.12xlarge
          MinCount: 0  # if min = max then capacity is maintained and will
          MaxCount: 4  # not scale down
          Efa:
            Enabled: true
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800  # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250  # can be increased to 500 or 1000 for PERSISTENT_2
      DeploymentType: PERSISTENT_2
  - Name: SharedEBS
    StorageType: Ebs
    MountDir: /apps  # Store your shared apps & scripts here
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Throughput: 300
      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true  # good for debug
  Dashboards:
    CloudWatch:
      Enabled: false  # provide basic dashboards


3 changes: 3 additions & 0 deletions 1.architectures/2.aws-parallelcluster/create-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the ParallelCluster cluster defined in pcluster-config.yaml.
# The AMI-compatibility validator is suppressed because the config pins a
# custom AMI; rollback is disabled so a failed stack can be inspected.
pcluster create-cluster \
    --cluster-configuration pcluster-config.yaml \
    --cluster-name pcluster-ml \
    --region us-west-2 \
    --suppress-validators "type:InstanceTypeBaseAMICompatibleValidator" \
    --rollback-on-failure "false"
4 changes: 4 additions & 0 deletions 1.architectures/2.aws-parallelcluster/create-key-pair.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Create an EC2 key pair for the cluster and save the private key locally
# with owner-only permissions.
# set -e: if key creation fails, do not leave (and chmod) an empty pem file.
set -euo pipefail

aws ec2 create-key-pair --key-name pcluster-workshop-key --query KeyMaterial --output text > pcluster-workshop-key.pem
# FIX: dropped the unnecessary sudo — the pem was just written by (and is
# owned by) the invoking user, so plain chmod suffices.
chmod 600 pcluster-workshop-key.pem
13 changes: 13 additions & 0 deletions 1.architectures/2.aws-parallelcluster/install-pcluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Install the AWS ParallelCluster CLI (plus awscli) into a dedicated
# virtualenv at ~/apc-ve.
# FIX: added set -e so a failed step (e.g. virtualenv creation) does not let
# the later pip installs run against the wrong interpreter.
set -euo pipefail

# Create Virtual env
python3 -m pip install --upgrade pip
python3 -m pip install --user --upgrade virtualenv

python3 -m virtualenv ~/apc-ve

# shellcheck disable=SC1090  # path is only known at runtime
source ~/apc-ve/bin/activate

pip3 install awscli

pip3 install aws-parallelcluster
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ variable "eks_version" {

variable "aws_region" {
type = string
default = "us-east-1"
default = "us-west-2"
}

variable "instance_type" {
Expand Down
2 changes: 1 addition & 1 deletion 3.test_cases/9.nemo-multimodal/nemo_configs/1.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ wandb_api_key_file: null # File where the w&B api key is stored. Key must be on
env_vars:
NCCL_DEBUG: INFO # Logging level for NCCL. Set to "INFO" for debug information
TRANSFORMER_OFFLINE: 1
FI_EFA_USE_DEVICE_RDMA: 1
#FI_EFA_USE_DEVICE_RDMA: 1
FI_PROVIDER: efa
NCCL_LAUNCH_MODE: parallel
FI_EFA_FORK_SAFE: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ run:

dataset_repo_id: ${HUGGINGFACE_DATASET_REPO_ID} # huggingface dataset repo id, in the format of {user_or_company}/{dataset_name}
# See https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads
dataset_output_root: ${DATASET_PATH}

download_parquet:
enable: True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# NeMo launcher training config: Stable Diffusion 860M, 256x256 pre-training
# on 2 nodes x 8 GPUs (fp16). Data is LAION-art webdataset shards on /fsx.
# NOTE(review): indentation was reconstructed per the NeMo launcher
# stable-diffusion config layout; the pasted source had lost all nesting.
run:
  name: stable_diffusion_860m_res_256_pretrain
  results_dir: /apps/nemo-src/launcher_scripts/results/stable_diffusion_860m_res_256_pretrain
  time_limit: 2-00:00:00
  dependency: singleton
name: stable-diffusion-train  # top-level job name used by the launcher
trainer:
  devices: 8
  num_nodes: 2
  accelerator: gpu
  precision: 16
  logger: false  # logging handled by exp_manager below
  enable_checkpointing: false  # checkpointing handled by exp_manager below
  replace_sampler_ddp: false
  max_epochs: 5
  max_steps: 82500
  log_every_n_steps: 10
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  benchmark: false
  enable_model_summary: true
exp_manager:
  explicit_log_dir: /apps/nemo-src/launcher_scripts/results/stable_diffusion_860m_res_256_pretrain/results
  exp_dir: null
  name: nemo_stable_diffusion
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: stable-diffusion
    group: nemo-sd
    name: nemo_stable_diffusion
    resume: true
  create_checkpoint_callback: true
  create_tensorboard_logger: true
  checkpoint_callback_params:
    every_n_train_steps: 1000
    every_n_epochs: 0
    monitor: reduced_train_loss
    filename: nemo-stable-diffusion--{reduced_train_loss:.2f}-{step}-{consumed_samples}
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  ema:
    enable: true
    decay: 0.9999
    validate_original_weights: false
    every_n_steps: 1
    cpu_offload: false
model:
  precision: 16
  micro_batch_size: 128
  global_batch_size: 8192  # = micro_batch * 16 GPUs * grad-accum * DP factor
  linear_start: 0.00085
  linear_end: 0.012
  num_timesteps_cond: 1
  log_every_t: 200
  timesteps: 1000
  first_stage_key: images
  cond_stage_key: captions
  image_size: 64
  channels: 4
  cond_stage_trainable: false
  conditioning_key: crossattn
  monitor: val/loss_simple_ema
  scale_factor: 0.18215
  use_ema: false
  scale_by_std: false
  ckpt_path: null
  ignore_keys: []
  parameterization: eps
  clip_denoised: true
  load_only_unet: false
  cosine_s: 0.008
  given_betas: null
  original_elbo_weight: 0
  v_posterior: 0
  l_simple_weight: 1
  use_positional_encodings: false
  learn_logvar: false
  logvar_init: 0
  beta_schedule: linear
  loss_type: l2
  concat_mode: true
  cond_stage_forward: null
  text_embedding_dropout_rate: 0
  fused_opt: true
  inductor: true
  inductor_cudagraphs: false
  capture_cudagraph_iters: -1
  channels_last: true
  unet_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel
    from_pretrained: null
    from_NeMo: true
    image_size: 32
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions:
      - 4
      - 2
      - 1
    num_res_blocks: 2
    channel_mult:
      - 1
      - 2
      - 4
      - 4
    num_heads: 8
    use_spatial_transformer: true
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: false
    legacy: false
    use_flash_attention: true
  first_stage_config:
    _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
    from_pretrained: null
    embed_dim: 4
    monitor: val/rec_loss
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
  cond_stage_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
    version: openai/clip-vit-large-patch14
    device: cuda
    max_length: 77
  seed: 666
  resume_from_checkpoint: null
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  optim:
    name: fused_adam
    lr: 8.192e-05
    weight_decay: 0.0
    betas:
      - 0.9
      - 0.999
    sched:
      name: WarmupHoldPolicy
      warmup_steps: 10000
      hold_steps: 10000000000000  # effectively hold the LR forever after warmup
  nsys_profile:
    enabled: false
    start_step: 10
    end_step: 10
    ranks:
      - 0
    gen_shape: false
  data:
    num_workers: 16
    train:
      dataset_path:
        - /fsx/laion-art-data/wdinfo.pkl
      augmentations:
        resize_smallest_side: 256
        center_crop_h_w: 256, 256
        horizontal_flip: false
      filterings: null
      webdataset:
        infinite_sampler: false
        local_root_path: /fsx/laion-art-data/tarfiles_reorganized/task0000/
Loading