Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add draft gpu troubles #290

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions 1.architectures/1.vpc_network/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Deploy the single-AZ VPC CloudFormation stack used by the ML cluster.
# FIX: added a space before the first line-continuation backslash — the
# original "vpc-stack-ml\" glued the backslash to the stack name, which is
# fragile (arguments fuse if a continuation line loses its leading spaces).
aws cloudformation create-stack --stack-name vpc-stack-ml \
    --template-body file://2.vpc-one-az.yaml \
    --parameters ParameterKey=SubnetsAZ,ParameterValue=us-west-2a \
                 ParameterKey=VPCName,ParameterValue="ML HPC VPC" \
    --capabilities CAPABILITY_IAM

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash

# Create a security group in the given VPC and open HTTP/HTTPS to the world
# (used to expose the Grafana monitoring dashboard).
set -euo pipefail

read -p "Please enter the vpc id of your cluster: " vpc_id
echo -e "creating a security group with $vpc_id..."
# --output text returns just the new group id (no JSON quoting).
security_group=$(aws ec2 create-security-group --group-name grafana-sg --description "Open HTTP/HTTPS ports" --vpc-id "${vpc_id}" --output text)
# Allow inbound HTTPS (443) and HTTP (80) from anywhere.
aws ec2 authorize-security-group-ingress --group-id "${security_group}" --protocol tcp --port 443 --cidr 0.0.0.0/0
# FIX: the original second rule used a Unicode em-dash ("—-cidr"), which the
# AWS CLI rejects as an unknown argument; it must be two ASCII hyphens.
aws ec2 authorize-security-group-ingress --group-id "${security_group}" --protocol tcp --port 80 --cidr 0.0.0.0/0
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# AWS ParallelCluster 3 configuration: Slurm cluster with a c6i.8xlarge head
# node and an on-demand g4dn.12xlarge GPU queue, FSx for Lustre + shared EBS
# storage, and the aws-parallelcluster-monitoring (Grafana) post-install hook.
# NOTE(review): indentation was reconstructed to the ParallelCluster 3 schema;
# the pasted source had lost all nesting. Subnet/SG/AMI ids are
# environment-specific — confirm before deploying.
Imds:
  ImdsSupport: v2.0
Image:
  Os: alinux2
  CustomAmi: ami-053d893ccc907c49c
Tags:
  - Key: 'Grafana'
    Value: 'true'
HeadNode:
  InstanceType: c6i.8xlarge
  Networking:
    SubnetId: subnet-08cdcb1f4d6abc7f3
    AdditionalSecurityGroups:
      - sg-0bbb389be5f1e6563
  Ssh:
    KeyName: pcluster-key
  LocalStorage:
    RootVolume:
      Size: 100
      DeleteOnTermination: true  # that's your root and /home volume for users
  CustomActions:
    OnNodeConfigured:
      # Installs the Prometheus/Grafana monitoring stack on the head node.
      Script: https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-monitoring/main/post-install.sh
      Args:
        - v0.9
  Iam:
    AdditionalIamPolicies:  # grant ECR, SSM and S3 read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
      - Policy: arn:aws:iam::aws:policy/CloudWatchFullAccess
      - Policy: arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess
      - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess
      - Policy: arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - subnet-04226fa682376b4f6
        PlacementGroup:
          Enabled: true  # cluster placement group for low-latency EFA traffic
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /local_scratch  # each instance has a local scratch on NVMe
          RootVolume:
            Size: 200
      CustomActions:
        OnNodeConfigured:
          Script: https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-monitoring/main/post-install.sh
          Args:
            - v0.9
      Iam:
        AdditionalIamPolicies:
          - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
          - Policy: arn:aws:iam::aws:policy/CloudWatchFullAccess
          - Policy: arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess
          - Policy: arn:aws:iam::aws:policy/AmazonSSMFullAccess
          - Policy: arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
      # The capacity reservation section is recommended if you use instances
      # with a targeted ODCRs. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to regroup
      # multiple reservations
      #CapacityReservationTarget:
      #  CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
      ComputeResources:
        - Name: distributed-ml
          InstanceType: g4dn.12xlarge
          MinCount: 0  # if min = max then capacity is maintained and will
          MaxCount: 4  # not scale down
          Efa:
            Enabled: true
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800  # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250  # can be increased to 500 or 1000 for PERSISTENT_2
      DeploymentType: PERSISTENT_2
  - Name: SharedEBS
    StorageType: Ebs
    MountDir: /apps  # Store your shared apps & scripts here
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Throughput: 300
      Iops: 6000
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true  # good for debug
  Dashboards:
    CloudWatch:
      Enabled: false  # provide basic dashboards


3 changes: 3 additions & 0 deletions 1.architectures/2.aws-parallelcluster/create-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Create the ParallelCluster cluster defined in pcluster-config.yaml.
# The AMI-compatibility validator is suppressed because the config pins a
# custom AMI; rollback is disabled so a failed stack can be inspected.
pcluster create-cluster \
    --cluster-configuration pcluster-config.yaml \
    --cluster-name pcluster-ml \
    --region us-west-2 \
    --suppress-validators "type:InstanceTypeBaseAMICompatibleValidator" \
    --rollback-on-failure "false"
4 changes: 4 additions & 0 deletions 1.architectures/2.aws-parallelcluster/create-key-pair.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Create an EC2 key pair for the cluster and save the private key locally
# with owner-only permissions.
# set -e: if key creation fails, do not leave (and chmod) an empty pem file.
set -euo pipefail

aws ec2 create-key-pair --key-name pcluster-workshop-key --query KeyMaterial --output text > pcluster-workshop-key.pem
# FIX: dropped the unnecessary sudo — the pem was just written by (and is
# owned by) the invoking user, so plain chmod suffices.
chmod 600 pcluster-workshop-key.pem
13 changes: 13 additions & 0 deletions 1.architectures/2.aws-parallelcluster/install-pcluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Install the AWS ParallelCluster CLI (plus awscli) into a dedicated
# virtualenv at ~/apc-ve.
# FIX: added set -e so a failed step (e.g. virtualenv creation) does not let
# the later pip installs run against the wrong interpreter.
set -euo pipefail

# Create Virtual env
python3 -m pip install --upgrade pip
python3 -m pip install --user --upgrade virtualenv

python3 -m virtualenv ~/apc-ve

# shellcheck disable=SC1090  # path is only known at runtime
source ~/apc-ve/bin/activate

pip3 install awscli

pip3 install aws-parallelcluster
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ variable "eks_version" {

variable "aws_region" {
type = string
default = "us-east-1"
default = "us-west-2"
}

variable "instance_type" {
Expand Down
2 changes: 1 addition & 1 deletion 3.test_cases/9.nemo-multimodal/nemo_configs/1.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ wandb_api_key_file: null # File where the w&B api key is stored. Key must be on
env_vars:
NCCL_DEBUG: INFO # Logging level for NCCL. Set to "INFO" for debug information
TRANSFORMER_OFFLINE: 1
FI_EFA_USE_DEVICE_RDMA: 1
#FI_EFA_USE_DEVICE_RDMA: 1
FI_PROVIDER: efa
NCCL_LAUNCH_MODE: parallel
FI_EFA_FORK_SAFE: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ run:

dataset_repo_id: ${HUGGINGFACE_DATASET_REPO_ID} # huggingface dataset repo id, in the format of {user_or_company}/{dataset_name}
# See https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads
dataset_output_root: ${DATASET_PATH}

download_parquet:
enable: True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# NeMo launcher training config: Stable Diffusion 860M, 256x256 pre-training
# on 2 nodes x 8 GPUs (fp16). Data is LAION-art webdataset shards on /fsx.
# NOTE(review): indentation was reconstructed per the NeMo launcher
# stable-diffusion config layout; the pasted source had lost all nesting.
run:
  name: stable_diffusion_860m_res_256_pretrain
  results_dir: /apps/nemo-src/launcher_scripts/results/stable_diffusion_860m_res_256_pretrain
  time_limit: 2-00:00:00
  dependency: singleton
name: stable-diffusion-train  # top-level job name used by the launcher
trainer:
  devices: 8
  num_nodes: 2
  accelerator: gpu
  precision: 16
  logger: false  # logging handled by exp_manager below
  enable_checkpointing: false  # checkpointing handled by exp_manager below
  replace_sampler_ddp: false
  max_epochs: 5
  max_steps: 82500
  log_every_n_steps: 10
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  benchmark: false
  enable_model_summary: true
exp_manager:
  explicit_log_dir: /apps/nemo-src/launcher_scripts/results/stable_diffusion_860m_res_256_pretrain/results
  exp_dir: null
  name: nemo_stable_diffusion
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: stable-diffusion
    group: nemo-sd
    name: nemo_stable_diffusion
    resume: true
  create_checkpoint_callback: true
  create_tensorboard_logger: true
  checkpoint_callback_params:
    every_n_train_steps: 1000
    every_n_epochs: 0
    monitor: reduced_train_loss
    filename: nemo-stable-diffusion--{reduced_train_loss:.2f}-{step}-{consumed_samples}
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  ema:
    enable: true
    decay: 0.9999
    validate_original_weights: false
    every_n_steps: 1
    cpu_offload: false
model:
  precision: 16
  micro_batch_size: 128
  global_batch_size: 8192  # = micro_batch * 16 GPUs * grad-accum * DP factor
  linear_start: 0.00085
  linear_end: 0.012
  num_timesteps_cond: 1
  log_every_t: 200
  timesteps: 1000
  first_stage_key: images
  cond_stage_key: captions
  image_size: 64
  channels: 4
  cond_stage_trainable: false
  conditioning_key: crossattn
  monitor: val/loss_simple_ema
  scale_factor: 0.18215
  use_ema: false
  scale_by_std: false
  ckpt_path: null
  ignore_keys: []
  parameterization: eps
  clip_denoised: true
  load_only_unet: false
  cosine_s: 0.008
  given_betas: null
  original_elbo_weight: 0
  v_posterior: 0
  l_simple_weight: 1
  use_positional_encodings: false
  learn_logvar: false
  logvar_init: 0
  beta_schedule: linear
  loss_type: l2
  concat_mode: true
  cond_stage_forward: null
  text_embedding_dropout_rate: 0
  fused_opt: true
  inductor: true
  inductor_cudagraphs: false
  capture_cudagraph_iters: -1
  channels_last: true
  unet_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel
    from_pretrained: null
    from_NeMo: true
    image_size: 32
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions:
      - 4
      - 2
      - 1
    num_res_blocks: 2
    channel_mult:
      - 1
      - 2
      - 4
      - 4
    num_heads: 8
    use_spatial_transformer: true
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: false
    legacy: false
    use_flash_attention: true
  first_stage_config:
    _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
    from_pretrained: null
    embed_dim: 4
    monitor: val/rec_loss
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
  cond_stage_config:
    _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
    version: openai/clip-vit-large-patch14
    device: cuda
    max_length: 77
  seed: 666
  resume_from_checkpoint: null
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  optim:
    name: fused_adam
    lr: 8.192e-05
    weight_decay: 0.0
    betas:
      - 0.9
      - 0.999
    sched:
      name: WarmupHoldPolicy
      warmup_steps: 10000
      hold_steps: 10000000000000  # effectively hold the LR forever after warmup
  nsys_profile:
    enabled: false
    start_step: 10
    end_step: 10
    ranks:
      - 0
    gen_shape: false
  data:
    num_workers: 16
    train:
      dataset_path:
        - /fsx/laion-art-data/wdinfo.pkl
      augmentations:
        resize_smallest_side: 256
        center_crop_h_w: 256, 256
        horizontal_flip: false
      filterings: null
      webdataset:
        infinite_sampler: false
        local_root_path: /fsx/laion-art-data/tarfiles_reorganized/task0000/
Loading