Skip to content

Commit

Permalink
Add ddp config and script.
Browse files Browse the repository at this point in the history
  • Loading branch information
KanaiYuma-aist committed Jan 28, 2025
1 parent 6ff0d7f commit dd0f2ad
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 0 deletions.
72 changes: 72 additions & 0 deletions examples/torch/MNIST/config_ddp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Training configuration for the MNIST example using the "ddp" strategy on
# 8 GPU devices. Objects are described with `_target_` (dotted import path)
# and `_partial_` keys.
#
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation had been lost. The placement of `num_classes` (under `task`)
# and of `batch_size` / `wrap_scatter_dataset` (under `datamodule`) should be
# confirmed against the constructors of the targeted classes.

trainer:
  accelerator: "gpu"
  devices: 8
  strategy: "ddp"
  max_epochs: 10
  callbacks:
    - _target_: lightning.pytorch.callbacks.ModelCheckpoint
      # Quoted on purpose: a plain scalar starting with `{` would be parsed
      # as a flow mapping.
      filename: "{epoch:04d}"
      save_last: true
      save_top_k: -1

task:
  _target_: torchvision_task.Resnet50Task
  model:
    _target_: torchvision.models.resnet50
    weights:
      # Resolves the dotted path to the weights object itself.
      _target_: hydra.utils.get_object
      path: torchvision.models.ResNet50_Weights.DEFAULT
  optimizer_config:
    _target_: aiaccel.torch.lightning.OptimizerConfig
    optimizer_generator:
      _partial_: true
      _target_: torch.optim.AdamW
      lr: 1.0e-4
  num_classes: 10

datamodule:
  _target_: aiaccel.torch.lightning.datamodules.single_datamodule.SingleDataModule
  train_dataset_fn:
    _partial_: true
    _target_: torchvision.datasets.MNIST
    root: "./dataset"
    train: true
    # NOTE(review): with 8 DDP processes sharing one `root`, confirm that
    # concurrent downloads cannot race (or pre-download the dataset).
    download: true
    transform:
      _target_: torchvision.transforms.Compose
      transforms:
        - _target_: torchvision.transforms.Resize
          size:
            - 256
            - 256
        # MNIST images are single-channel; expand to 3 channels for ResNet-50.
        - _target_: torchvision.transforms.Grayscale
          num_output_channels: 3
        - _target_: torchvision.transforms.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
          std:
            - 0.5
  val_dataset_fn:
    _partial_: true
    _target_: torchvision.datasets.MNIST
    root: "./dataset"
    train: false
    download: true
    # Same preprocessing pipeline as the training split.
    transform:
      _target_: torchvision.transforms.Compose
      transforms:
        - _target_: torchvision.transforms.Resize
          size:
            - 256
            - 256
        - _target_: torchvision.transforms.Grayscale
          num_output_channels: 3
        - _target_: torchvision.transforms.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
          std:
            - 0.5
  batch_size: 128
  wrap_scatter_dataset: false
18 changes: 18 additions & 0 deletions examples/torch/MNIST/train_ddp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#! /bin/bash

#PBS -q rt_HF
#PBS -l select=1
#PBS -l walltime=1:00:00
#PBS -P grpname
#PBS -j oe

# PBS job script that launches aiaccel.torch.apps.train with config_ddp.yaml.
# Replace the placeholders before submitting: grpname, path_to_aiaccel_env,
# path_to_working_directory and path_to_python.sif.

# Run from the directory the job was submitted from; stop immediately if it
# cannot be entered so the remaining commands never run in the wrong place.
cd "${PBS_O_WORKDIR}" || exit 1

source /etc/profile.d/modules.sh
module load cuda/12.6/12.6.1

source path_to_aiaccel_env/bin/activate

# Directory holding config_ddp.yaml; also passed as --working_directory.
wd=path_to_working_directory

# Paths are quoted so a working directory containing spaces is not split
# into several arguments.
singularity exec --nv path_to_python.sif python -m aiaccel.torch.apps.train "$wd/config_ddp.yaml" --working_directory "$wd"

0 comments on commit dd0f2ad

Please sign in to comment.