From 5aaae95dbbd9918ef5498bae2414b09e8b43b7b9 Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 17:53:59 +0100
Subject: [PATCH 01/24] WIP started coding the wandb hydra example for JZ

---
 docs/examples/tf/tf_wandb_hydra/README.md | 19 +++++++++++++++++++
 .../tf/tf_wandb_hydra/requirements.txt    |  3 +++
 2 files changed, 22 insertions(+)
 create mode 100644 docs/examples/tf/tf_wandb_hydra/README.md
 create mode 100644 docs/examples/tf/tf_wandb_hydra/requirements.txt

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
new file mode 100644
index 0000000..b70e92c
--- /dev/null
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -0,0 +1,19 @@
+# [Weights&Biases - Hydra](https://github.com/jean-zay-users/jean-zay-doc/tree/master/docs/examples/tf/tf_wandb_hydra)
+
+
+## Installation
+
+To run this example, you need to clone the jean-zay-doc repo in your `$WORK` dir:
+```
+cd $WORK &&\
+git clone https://github.com/jean-zay-users/jean-zay-doc.git
+```
+
+You can then install the requirements:
+```
+module purge
+module load tensorflow-gpu/py3/2.6.0
+pip install --user -r $WORK/jean-zay-doc/docs/examples/tf/tf_wandb_hydra/requirements.txt
+```
+
+## Run
diff --git a/docs/examples/tf/tf_wandb_hydra/requirements.txt b/docs/examples/tf/tf_wandb_hydra/requirements.txt
new file mode 100644
index 0000000..b38d6a6
--- /dev/null
+++ b/docs/examples/tf/tf_wandb_hydra/requirements.txt
@@ -0,0 +1,3 @@
+hydra-core
+wandb
+hydra-submitit-launcher
\ No newline at end of file

From ad57780f92eb1cc88b1a4324034b7c3123bfc7bf Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 18:05:27 +0100
Subject: [PATCH 02/24] added a mnist example in the hydra style

---
 .../examples/tf/tf_wandb_hydra/train_mnist.py | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 docs/examples/tf/tf_wandb_hydra/train_mnist.py

diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
new file mode 100644
index 0000000..95ecf2d
--- /dev/null
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -0,0 +1,58 @@
+# all taken from https://www.tensorflow.org/guide/keras/functional
+import hydra
+
+@hydra.main(config_path='../conf', config_name='config')
+def train_dense_model_main(cfg):
+    return train_dense_model(cfg)
+
+
+def my_model(input_shape=784, output_num=10, activation='relu', hidden_size=64):
+    inputs = keras.Input(shape=input_shape)
+    x = layers.Dense(hidden_size, activation=activation)(inputs)
+    x = layers.Dense(hidden_size, activation=activation)(x)
+    outputs = layers.Dense(output_num)(x)
+    return keras.Model(inputs=inputs, outputs=outputs, name='mnist_model')
+
+def model_compile(model, loss, optimizer='rmsprop'):
+    if loss == 'xent':
+        loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    model.compile(loss=loss,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+
+def data(n_train=60_000, n_test=10_000, n_features=784, n_classes=10):
+    # training and inference
+    # network is not reachable, so we use random data
+    x_train = tf.random.normal((n_train, n_features), dtype='float32')
+    x_test = tf.random.normal((n_test, n_features), dtype='float32')
+    y_train = tf.random.uniform((n_train,), minval=0, maxval=n_classes, dtype='int32')
+    y_test = tf.random.uniform((n_test,), minval=0, maxval=n_classes, dtype='int32')
+    return x_train, x_test, y_train, y_test
+
+
+def train_dense_model(cfg):
+    # limit imports outside the call to the function, in order to launch quickly
+    # when using dask
+    import tensorflow as tf
+    from tensorflow import keras
+    from tensorflow.keras import layers
+    # model building
+    tf.keras.backend.clear_session()  # For easy reset of notebook state.
+
+
+    model = my_model(**cfg.model)
+
+    model_compile(model, **cfg.compile)
+
+
+    x_train, x_test, y_train, y_test = data(**cfg.data)
+
+
+    history = model.fit(x_train, y_train, **cfg.fit)
+    test_scores = model.evaluate(x_test, y_test, verbose=2)
+    print('Test loss:', test_scores[0])
+    print('Test accuracy:', test_scores[1])
+    return True
+
+if __name__ == '__main__':
+    train_dense_model_main()

From 786a6140595a598b08fb3c85df7877422069f245 Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 18:11:10 +0100
Subject: [PATCH 03/24] slight corrections in train mnist

---
 docs/examples/tf/tf_wandb_hydra/train_mnist.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
index 95ecf2d..db01986 100644
--- a/docs/examples/tf/tf_wandb_hydra/train_mnist.py
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -1,6 +1,7 @@
 # all taken from https://www.tensorflow.org/guide/keras/functional
 import hydra
 
+
 @hydra.main(config_path='../conf', config_name='config')
 def train_dense_model_main(cfg):
     return train_dense_model(cfg)
@@ -13,7 +14,7 @@ def my_model(input_shape=784, output_num=10, activation='relu', hidden_size=64):
     outputs = layers.Dense(output_num)(x)
     return keras.Model(inputs=inputs, outputs=outputs, name='mnist_model')
 
-def model_compile(model, loss, optimizer='rmsprop'):
+def model_compile(model, loss='xent', optimizer='rmsprop'):
     if loss == 'xent':
         loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     model.compile(loss=loss,

From ed9d2c76f407b4ee1ec31854ce55db6ef7c5577d Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 18:11:26 +0100
Subject: [PATCH 04/24] added configurations for hydra

---
 .../examples/tf/tf_wandb_hydra/conf/config.yaml | 12 ++++++++++++
 .../conf/hydra/launcher/base.yaml               | 17 +++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 docs/examples/tf/tf_wandb_hydra/conf/config.yaml
 create mode 100644 docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
new file mode 100644
index 0000000..b474dc9
--- /dev/null
+++ b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
@@ -0,0 +1,12 @@
+data:
+  n_features: 784
+  n_classes: 10
+
+model:
+  input_shape: ${data.n_features}
+  output_num: ${data.n_classes}
+
+fit:
+  epochs: 5
+  batch_size: 64
+  validation_split: 0.2
\ No newline at end of file
diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
new file mode 100644
index 0000000..9d449a8
--- /dev/null
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -0,0 +1,17 @@
+defaults:
+  - submitit_slurm
+
+timeout_min: null
+gpus_per_node: 1
+tasks_per_node: 1
+gres: "gpu:${hydra.launcher.gpus_per_node}"
+qos: qos_gpu-dev
+cpus_per_gpu: 10
+gpus_per_task: ${hydra.launcher.gpus_per_node}
+additional_parameters:
+  account: ${project-id}@gpu
+  distribution: "block:block"
+  hint: nomultithread
+  time: "${hours}:00:00"
+setup:
+  - "#SBATCH -C v100-32g"
\ No newline at end of file

From 1249c938652a593b59f67a7aa743007737567c83 Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 18:16:31 +0100
Subject: [PATCH 05/24] added wandb to example

---
 .../tf/tf_wandb_hydra/conf/config.yaml        | 11 +++++++-
 .../conf/hydra/launcher/base.yaml             |  3 ++-
 .../examples/tf/tf_wandb_hydra/train_mnist.py | 27 ++++++++++++-------
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
index b474dc9..d0089e5 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
@@ -9,4 +9,13 @@
 fit:
   epochs: 5
   batch_size: 64
-  validation_split: 0.2
\ No newline at end of file
+  validation_split: 0.2
+
+wanbd:
+  project: jean-zay-doc
+  notes: "Hydra-wandb-submitit exp"
+  tags:
+    - hydra
+    - tuto
+  dir: "${oc.env:SCRATCH}/wandb/jean-zay-doc"
+  mode: null
\ No newline at end of file
diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index 9d449a8..2de4e05 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -14,4 +14,5 @@ additional_parameters:
   hint: nomultithread
   time: "${hours}:00:00"
 setup:
-  - "#SBATCH -C v100-32g"
\ No newline at end of file
+  - "#SBATCH -C v100-32g"
+  - "export WANDB_MODE=offline"
\ No newline at end of file
diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
index db01986..c4347cf 100644
--- a/docs/examples/tf/tf_wandb_hydra/train_mnist.py
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -1,5 +1,10 @@
 # all taken from https://www.tensorflow.org/guide/keras/functional
+from pathlib import Path
+
 import hydra
+from omegaconf import OmegaConf
+import wandb
+from wandb.keras import WandbCallback
 
 
 @hydra.main(config_path='../conf', config_name='config')
@@ -37,19 +42,23 @@ def train_dense_model(cfg):
     import tensorflow as tf
     from tensorflow import keras
     from tensorflow.keras import layers
-    # model building
-    tf.keras.backend.clear_session()  # For easy reset of notebook state.
-
-
-    model = my_model(**cfg.model)
-
-    model_compile(model, **cfg.compile)
-
-
-    x_train, x_test, y_train, y_test = data(**cfg.data)
-
-
-    history = model.fit(x_train, y_train, **cfg.fit)
+    # wandb setup
+    Path(cfg.wandb.dir).mkdir(exist_ok=True, parents=True)
+    wandb.init(
+        config=OmegaConf.to_container(cfg, resolve=True),
+        **cfg.wandb,
+    )
+    callbacks = [
+        WandbCallback(monitor='loss', save_weights_only=True),
+    ]
+
+    # model building
+    tf.keras.backend.clear_session()
+    model = my_model(**cfg.model)
+    model_compile(model, **cfg.compile)
+    x_train, x_test, y_train, y_test = data(**cfg.data)
+    history = model.fit(x_train, y_train, **cfg.fit, callbacks=callbacks)
     test_scores = model.evaluate(x_test, y_test, verbose=2)
     print('Test loss:', test_scores[0])
     print('Test accuracy:', test_scores[1])
     return True

From a5a5c1a4b91e8aec48949b4abc146280e26cd0e0 Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Thu, 6 Jan 2022 23:05:32 +0100
Subject: [PATCH 06/24] expanded readme for the example

---
 docs/examples/tf/tf_wandb_hydra/README.md | 56 +++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index b70e92c..1444533 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -17,3 +17,59 @@ pip install --user -r $WORK/jean-zay-doc/docs/examples/tf/tf_wandb_hydra/require
 ```
 
 ## Run
+To run the example on SLURM, issue the following command from the example directory:
+```
+python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1
+```
+where `yyy` is your Jean Zay project id.
+
+### SLURM parametrization
+Different parameters can be set for the SLURM job, using the `hydra.launcher` config group.
+For example, to launch a longer job, you can use:
+```
+python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3
+```
+
+If you want to use more GPUs:
+```
+python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3 hydra.launcher.gpus_per_node=4
+```
+
+### Weights&Biases
+`wandb` is run offline because the compute nodes are not connected to the internet.
+In order to have the results uploaded to the cloud, you need to manually sync them using the `wandb sync run_dir` command.
+The run directories are located in `$SCRATCH/wandb/jean-zay-doc`, but this can be changed using the `wandb.dir` config variable.
+You can also run a script on a front node to sync the runs before they are finished, for example using the script [here](https://github.com/zaccharieramzi/submission-scripts/blob/master/jean_zay/syncall_wandb.sh).
+
+### Hydra and submitit outputs
+The outputs created by Hydra and submitit are located in the `multirun` directory.
+You can change this location by setting the `hydra.sweep.dir` config variable.
+
+### Batch jobs
+In order to batch multiple similar jobs you can use the sweep feature of Hydra.
+For example, if you want to run multiple trainings with different batch sizes, you can do the following:
+```
+python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1 fit.batch_size=32,64,128
+```
+
+This can be extended, for example, to a grid search over a Cartesian product:
+```
+python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1 fit.batch_size=32,64,128 compile.optimizer=rmsprop,adam
+```
+
+## Similar resources
+
+- [slurm-hydra-submitit](https://github.com/RaphaelMeudec/slurm-hydra-submitit) presents a similar concept in a more general case for any SLURM cluster, without W&B. In particular, it specifies [how to run a grid search over specific parameter combinations](https://github.com/RaphaelMeudec/slurm-hydra-submitit#specific-parameters-combinations).
+- [submission-scripts](https://github.com/zaccharieramzi/submission-scripts/tree/master/jean_zay/hydra_config) includes a packaged hydra submitit launcher config for the Jean Zay cluster. In particular, this means that instead of having to copy over the launcher configuration, you can install this package (`pip install --user submission-scripts`), and use the launcher as follows (`-m` is equivalent to `--multirun`):
+```
+python train_mnist.py -m hydra/launcher=base\
+    'hydra.searchpath=[pkg://jean_zay/hydra_config]'\
+    +hours=1 additional_parameters.account=yyy@gpu +project=project_name
+```
+
+
+## References
+- Weights&Biases: https://wandb.ai/site
+- Hydra: https://hydra.cc/
+- Submitit: https://github.com/facebookincubator/submitit
+- Hydra submitit launcher: https://hydra.cc/docs/plugins/submitit_launcher/
\ No newline at end of file

From a255edfac23f77245e5f0efc7f833ba85a797e9a Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:16:01 +0100
Subject: [PATCH 07/24] corrected config relative placement

---
 docs/examples/tf/tf_wandb_hydra/train_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
index c4347cf..1120349 100644
--- a/docs/examples/tf/tf_wandb_hydra/train_mnist.py
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -7,7 +7,7 @@ from wandb.keras import WandbCallback
 
 
-@hydra.main(config_path='../conf', config_name='config')
+@hydra.main(config_path='conf', config_name='config')
 def train_dense_model_main(cfg):
     return train_dense_model(cfg)

From 8e31a478485dc3dc7959089661644aed13a44489 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:18:05 +0100
Subject: [PATCH 08/24] corrected project id

---
 docs/examples/tf/tf_wandb_hydra/README.md           | 10 +++++-----
 .../tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 1444533..bf6a692 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -19,7 +19,7 @@ pip install --user -r $WORK/jean-zay-doc/docs/examples/tf/tf_wandb_hydra/require
 ## Run
 To run the example on SLURM, issue the following command from the example directory:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1
 ```
 where `yyy` is your Jean Zay project id.
@@ -27,12 +27,12 @@ where `yyy` is your Jean Zay project id.
 
 ### SLURM parametrization
 Different parameters can be set for the SLURM job, using the `hydra.launcher` config group.
 For example, to launch a longer job, you can use:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3
 ```
 
 If you want to use more GPUs:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3 hydra.launcher.gpus_per_node=4
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3 hydra.launcher.gpus_per_node=4
 ```
@@ -49,12 +49,12 @@
 ### Batch jobs
 In order to batch multiple similar jobs you can use the sweep feature of Hydra.
 For example, if you want to run multiple trainings with different batch sizes, you can do the following:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1 fit.batch_size=32,64,128
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1 fit.batch_size=32,64,128
 ```
 
 This can be extended, for example, to a grid search over a Cartesian product:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project-id=yyy +hours=1 fit.batch_size=32,64,128 compile.optimizer=rmsprop,adam
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1 fit.batch_size=32,64,128 compile.optimizer=rmsprop,adam
 ```
diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index 2de4e05..1f71582 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -9,7 +9,7 @@ qos: qos_gpu-dev
 cpus_per_gpu: 10
 gpus_per_task: ${hydra.launcher.gpus_per_node}
 additional_parameters:
-  account: ${project-id}@gpu
+  account: ${project_id}@gpu
   distribution: "block:block"
   hint: nomultithread
   time: "${hours}:00:00"

From 193e6c6ed3f8ef71b6d5776bfd02042e5e29de5b Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:18:55 +0100
Subject: [PATCH 09/24] corrected timeout min

---
 docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index 1f71582..8a9be88 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -1,7 +1,7 @@
 defaults:
   - submitit_slurm
 
-timeout_min: null
+timeout_min: 60
 gpus_per_node: 1
 tasks_per_node: 1
 gres: "gpu:${hydra.launcher.gpus_per_node}"

From 9b5ce42ef5140c4a220f1af392fc78642bd7d563 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:20:02 +0100
Subject: [PATCH 10/24] made the wandb imports lazy

---
 docs/examples/tf/tf_wandb_hydra/train_mnist.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
index 1120349..3bcc6a4 100644
--- a/docs/examples/tf/tf_wandb_hydra/train_mnist.py
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -3,8 +3,6 @@
 
 import hydra
 from omegaconf import OmegaConf
-import wandb
-from wandb.keras import WandbCallback
 
 
 @hydra.main(config_path='conf', config_name='config')
@@ -42,6 +40,8 @@ def train_dense_model(cfg):
     import tensorflow as tf
     from tensorflow import keras
     from tensorflow.keras import layers
+    import wandb
+    from wandb.keras import WandbCallback

From 4f8a1d2ac890f1ebadb8176116f468feb25067f2 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:23:32 +0100
Subject: [PATCH 11/24] added more self promotion and potential correction to qos

---
 docs/examples/tf/tf_wandb_hydra/README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index bf6a692..66bb577 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -27,12 +27,12 @@ Different parameters can be set for the SLURM job, using the `hydra.launcher` config group.
 For example, to launch a longer job, you can use:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos='qos_gpu-t3'
 ```
 
 If you want to use more GPUs:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos=qos_gpu-t3 hydra.launcher.gpus_per_node=4
+python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos='qos_gpu-t3' hydra.launcher.gpus_per_node=4
 ```
@@ -66,6 +66,10 @@ python train_mnist.py -m hydra/launcher=base\
     'hydra.searchpath=[pkg://jean_zay/hydra_config]'\
     +hours=1 additional_parameters.account=yyy@gpu +project=project_name
 ```
+or equivalently:
+```
+submitit-hydra-launch train_mnist.py base +hours=1 additional_parameters.account=yyy@gpu +project=project_name
+```
 
 
 ## References

From 349ef11bef14aad36d117fd0bd519eb5012e6b78 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:23:44 +0100
Subject: [PATCH 12/24] corrected typo on wandb in config

---
 docs/examples/tf/tf_wandb_hydra/conf/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
index d0089e5..435a66e 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
@@ -11,7 +11,7 @@ fit:
   batch_size: 64
   validation_split: 0.2
 
-wanbd:
+wandb:
   project: jean-zay-doc
   notes: "Hydra-wandb-submitit exp"
   tags:

From 48599053df1c0936c1b679d4cc0fb3f3a6d29e0e Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:28:31 +0100
Subject: [PATCH 13/24] added the functions in the script to keep the imports
 of tensorflow and wandb lazy

---
 .../examples/tf/tf_wandb_hydra/train_mnist.py | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/train_mnist.py b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
index 3bcc6a4..32c59e8 100644
--- a/docs/examples/tf/tf_wandb_hydra/train_mnist.py
+++ b/docs/examples/tf/tf_wandb_hydra/train_mnist.py
@@ -10,30 +10,6 @@ def train_dense_model_main(cfg):
     return train_dense_model(cfg)
 
 
-def my_model(input_shape=784, output_num=10, activation='relu', hidden_size=64):
-    inputs = keras.Input(shape=input_shape)
-    x = layers.Dense(hidden_size, activation=activation)(inputs)
-    x = layers.Dense(hidden_size, activation=activation)(x)
-    outputs = layers.Dense(output_num)(x)
-    return keras.Model(inputs=inputs, outputs=outputs, name='mnist_model')
-
-def model_compile(model, loss='xent', optimizer='rmsprop'):
-    if loss == 'xent':
-        loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-    model.compile(loss=loss,
-                  optimizer=optimizer,
-                  metrics=['accuracy'])
-
-def data(n_train=60_000, n_test=10_000, n_features=784, n_classes=10):
-    # training and inference
-    # network is not reachable, so we use random data
-    x_train = tf.random.normal((n_train, n_features), dtype='float32')
-    x_test = tf.random.normal((n_test, n_features), dtype='float32')
-    y_train = tf.random.uniform((n_train,), minval=0, maxval=n_classes, dtype='int32')
-    y_test = tf.random.uniform((n_test,), minval=0, maxval=n_classes, dtype='int32')
-    return x_train, x_test, y_train, y_test
-
-
 def train_dense_model(cfg):
     # limit imports outside the call to the function, in order to launch quickly
     # when using dask
@@ -43,6 +19,31 @@ def train_dense_model(cfg):
     import tensorflow as tf
     from tensorflow import keras
     from tensorflow.keras import layers
     import wandb
     from wandb.keras import WandbCallback
 
+
+    def my_model(input_shape=784, output_num=10, activation='relu', hidden_size=64):
+        inputs = keras.Input(shape=input_shape)
+        x = layers.Dense(hidden_size, activation=activation)(inputs)
+        x = layers.Dense(hidden_size, activation=activation)(x)
+        outputs = layers.Dense(output_num)(x)
+        return keras.Model(inputs=inputs, outputs=outputs, name='mnist_model')
+
+    def model_compile(model, loss='xent', optimizer='rmsprop'):
+        if loss == 'xent':
+            loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        model.compile(loss=loss,
+                      optimizer=optimizer,
+                      metrics=['accuracy'])
+
+    def data(n_train=60_000, n_test=10_000, n_features=784, n_classes=10):
+        # training and inference
+        # network is not reachable, so we use random data
+        x_train = tf.random.normal((n_train, n_features), dtype='float32')
+        x_test = tf.random.normal((n_test, n_features), dtype='float32')
+        y_train = tf.random.uniform((n_train,), minval=0, maxval=n_classes, dtype='int32')
+        y_test = tf.random.uniform((n_test,), minval=0, maxval=n_classes, dtype='int32')
+        return x_train, x_test, y_train, y_test
+
+
     # wandb setup
     Path(cfg.wandb.dir).mkdir(exist_ok=True, parents=True)

From 12a880c2099b99878fe4dccd2ce4077e53cba12b Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 14:55:55 +0100
Subject: [PATCH 14/24] made compile an integral part of the config

---
 docs/examples/tf/tf_wandb_hydra/conf/config.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
index 435a66e..2f31545 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/config.yaml
@@ -11,11 +11,14 @@ fit:
   batch_size: 64
   validation_split: 0.2
 
+compile:
+  optimizer: rmsprop
+
 wandb:
   project: jean-zay-doc
   notes: "Hydra-wandb-submitit exp"
   tags:
     - hydra
     - tuto
-  dir: "${oc.env:SCRATCH}/wandb/jean-zay-doc"
+  dir: "${oc.env:SCRATCH,.}/wandb/jean-zay-doc"
   mode: null
\ No newline at end of file

From 987ed8c0773c9f83633b68d326aa8b7cc05ecef0 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 15:50:26 +0100
Subject: [PATCH 15/24] corrected name of sync all package

---
 docs/examples/tf/tf_wandb_hydra/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 66bb577..24006fa 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -39,7 +39,7 @@ python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 h
 `wandb` is run offline because the compute nodes are not connected to the internet.
 In order to have the results uploaded to the cloud, you need to manually sync them using the `wandb sync run_dir` command.
 The run directories are located in `$SCRATCH/wandb/jean-zay-doc`, but this can be changed using the `wandb.dir` config variable.
-You can also run a script on a front node to sync the runs before they are finished, for example using the script [here](https://github.com/zaccharieramzi/submission-scripts/blob/master/jean_zay/syncall_wandb.sh).
+You can also run a script on a front node to sync the runs before they are finished, for example using the script [here](https://github.com/zaccharieramzi/submission-scripts/blob/master/jean_zay/syncall-wandb).

From 560e509a92866af9b869103bec00c4c4b19f6df0 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 15:58:38 +0100
Subject: [PATCH 16/24] made sync all script a gist to have more stability

---
 docs/examples/tf/tf_wandb_hydra/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 24006fa..995c0ac 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -39,7 +39,7 @@ python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 h
 `wandb` is run offline because the compute nodes are not connected to the internet.
 In order to have the results uploaded to the cloud, you need to manually sync them using the `wandb sync run_dir` command.
 The run directories are located in `$SCRATCH/wandb/jean-zay-doc`, but this can be changed using the `wandb.dir` config variable.
-You can also run a script on a front node to sync the runs before they are finished, for example using the script [here](https://github.com/zaccharieramzi/submission-scripts/blob/master/jean_zay/syncall-wandb).
+You can also run a script on a front node to sync the runs before they are finished, for example using the script [here](https://gist.github.com/zaccharieramzi/3e1abc67aefac106ede2883c56ac8e1a).

From 025662ed209e7745ef0d4a0e8659abc2ded25627 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 16:37:50 +0100
Subject: [PATCH 17/24] removed need for project id and got it from the env

---
 docs/examples/tf/tf_wandb_hydra/README.md           | 13 ++++++-------
 .../tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 995c0ac..0f30567 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -19,20 +19,19 @@ pip install --user -r $WORK/jean-zay-doc/docs/examples/tf/tf_wandb_hydra/require
 ## Run
 To run the example on SLURM, issue the following command from the example directory:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1
+python train_mnist.py --multirun hydra/launcher=base +hours=1
 ```
-where `yyy` is your Jean Zay project id.
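With this change, the SLURM account is read from the `$IDRPROJ` environment variable, which is already set for you on Jean Zay. As a quick sketch, if you ever need to target another project for a single launch (assuming a bash-like shell, with `yyy` standing for the other project id):
```
# Set IDRPROJ for this one launch only; the launcher resolves it at submission time.
IDRPROJ=yyy python train_mnist.py --multirun hydra/launcher=base +hours=1
```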
 
 ### SLURM parametrization
 Different parameters can be set for the SLURM job, using the `hydra.launcher` config group.
 For example, to launch a longer job, you can use:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos='qos_gpu-t3'
+python train_mnist.py --multirun hydra/launcher=base +hours=10 hydra.launcher.qos='qos_gpu-t3'
 ```
 
 If you want to use more GPUs:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=10 hydra.launcher.qos='qos_gpu-t3' hydra.launcher.gpus_per_node=4
+python train_mnist.py --multirun hydra/launcher=base +hours=10 hydra.launcher.qos='qos_gpu-t3' hydra.launcher.gpus_per_node=4
 ```
 
 ### Weights&Biases
@@ -49,18 +48,18 @@ You can change this location by setting the `hydra.sweep.dir` config variable.
 ### Batch jobs
 In order to batch multiple similar jobs you can use the sweep feature of Hydra.
 For example, if you want to run multiple trainings with different batch sizes, you can do the following:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1 fit.batch_size=32,64,128
+python train_mnist.py --multirun hydra/launcher=base +hours=1 fit.batch_size=32,64,128
 ```
 
 This can be extended, for example, to a grid search over a Cartesian product:
 ```
-python train_mnist.py --multirun hydra/launcher=base +project_id=yyy +hours=1 fit.batch_size=32,64,128 compile.optimizer=rmsprop,adam
+python train_mnist.py --multirun hydra/launcher=base +hours=1 fit.batch_size=32,64,128 compile.optimizer=rmsprop,adam
 ```
 
 ## Similar resources
 
 - [slurm-hydra-submitit](https://github.com/RaphaelMeudec/slurm-hydra-submitit) presents a similar concept in a more general case for any SLURM cluster, without W&B. In particular, it specifies [how to run a grid search over specific parameter combinations](https://github.com/RaphaelMeudec/slurm-hydra-submitit#specific-parameters-combinations).
 - [submission-scripts](https://github.com/zaccharieramzi/submission-scripts/tree/master/jean_zay/hydra_config) includes a packaged hydra submitit launcher config for the Jean Zay cluster. In particular, this means that instead of having to copy over the launcher configuration, you can install this package (`pip install --user submission-scripts`), and use the launcher as follows (`-m` is equivalent to `--multirun`) if `yyy` is your project id (`echo $IDRPROJ`):
 ```
 python train_mnist.py -m hydra/launcher=base\
     'hydra.searchpath=[pkg://jean_zay/hydra_config]'\
     +hours=1 additional_parameters.account=yyy@gpu +project=project_name
 ```
diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index 8a9be88..c9797e9 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -9,7 +9,7 @@ qos: qos_gpu-dev
 cpus_per_gpu: 10
 gpus_per_task: ${hydra.launcher.gpus_per_node}
 additional_parameters:
-  account: ${project_id}@gpu
+  account: ${oc.env:IDRPROJ,${project_id}}@gpu
   distribution: "block:block"
   hint: nomultithread
   time: "${hours}:00:00"

From 7927128c657bf7a1c55d9149ae2ea8294d21f4fe Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Fri, 7 Jan 2022 16:40:03 +0100
Subject: [PATCH 18/24] removed default in project id

---
 docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index c9797e9..a704356 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -9,7 +9,7 @@ qos: qos_gpu-dev
 cpus_per_gpu: 10
 gpus_per_task: ${hydra.launcher.gpus_per_node}
 additional_parameters:
-  account: ${oc.env:IDRPROJ,${project_id}}@gpu
+  account: ${oc.env:IDRPROJ}@gpu
   distribution: "block:block"
   hint: nomultithread
   time: "${hours}:00:00"

From a23b9131628a8724847824adc9ef00b7dcfc7718 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Mon, 10 Jan 2022 10:59:46 +0100
Subject: [PATCH 19/24] added explanation on sbatch -C

---
 docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
index a704356..9e13e0b 100644
--- a/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
+++ b/docs/examples/tf/tf_wandb_hydra/conf/hydra/launcher/base.yaml
@@ -14,5 +14,6 @@ additional_parameters:
   hint: nomultithread
   time: "${hours}:00:00"
 setup:
-  - "#SBATCH -C v100-32g"
+  - "#SBATCH -C v100-32g"  # this setup is needed here and not in additional_parameters,
+                           # because otherwise it would be difficult to remove at run time.
   - "export WANDB_MODE=offline"
\ No newline at end of file

From 7cd061ce29d0f4a3c893a7aa358e836f7c7c82c5 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Mon, 10 Jan 2022 11:16:04 +0100
Subject: [PATCH 20/24] added an introduction and some alternatives to the
 packages introduced

---
 docs/examples/tf/tf_wandb_hydra/README.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 0f30567..350ed32 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -1,5 +1,10 @@
 # [Weights&Biases - Hydra](https://github.com/jean-zay-users/jean-zay-doc/tree/master/docs/examples/tf/tf_wandb_hydra)
 
+Weights&Biases and Hydra are two tools used in machine learning projects.
+
+Weights&Biases allows you to easily save a lot of information about your different experiments in the cloud, like metadata, system data, model weights, and of course your metrics and logs.
+Hydra is a configuration management tool that allows you to build command line interfaces and create robust and readable configuration files.
+These two tools can be used together very elegantly and easily, but their setup on Jean Zay is not straightforward. In this example, we show how to set up both tools on Jean Zay with TensorFlow.
 
 ## Installation
@@ -75,4 +80,14 @@
 ## References
 - Weights&Biases: https://wandb.ai/site
 - Hydra: https://hydra.cc/
 - Submitit: https://github.com/facebookincubator/submitit
-- Hydra submitit launcher: https://hydra.cc/docs/plugins/submitit_launcher/
\ No newline at end of file
+- Hydra submitit launcher: https://hydra.cc/docs/plugins/submitit_launcher/
+
+## Alternatives
+
+To Weights&Biases:
+- MLflow
+- TensorBoard
+
+To Hydra:
+- argparse
+- click
\ No newline at end of file

From b4cde5afa891e8aec48949b4abc146280e26cd0e0 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Mon, 10 Jan 2022 13:30:35 +0100
Subject: [PATCH 21/24] replaced my sub scripts (unsuited here) with the custom
 launcher I developed in similar resources

---
 docs/examples/tf/tf_wandb_hydra/README.md | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 350ed32..f46d9ad 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -64,15 +64,9 @@ python train_mnist.py --multirun hydra/launcher=base +hours=1 fit.batch_size=32,
 ## Similar resources
 
 - [slurm-hydra-submitit](https://github.com/RaphaelMeudec/slurm-hydra-submitit) presents a similar concept in a more general case for any SLURM cluster, without W&B. In particular, it specifies [how to run a grid search over specific parameter combinations](https://github.com/RaphaelMeudec/slurm-hydra-submitit#specific-parameters-combinations).
-- [submission-scripts](https://github.com/zaccharieramzi/submission-scripts/tree/master/jean_zay/hydra_config) includes a packaged hydra submitit launcher config for the Jean Zay cluster. In particular, this means that instead of having to copy over the launcher configuration, you can install this package (`pip install --user submission-scripts`), and use the launcher as follows (`-m` is equivalent to `--multirun`) if `yyy` is your project id (`echo $IDRPROJ`):
-```
-python train_mnist.py -m hydra/launcher=base\
-    'hydra.searchpath=[pkg://jean_zay/hydra_config]'\
-    +hours=1 additional_parameters.account=yyy@gpu +project=project_name
-```
-or equivalently:
-```
-submitit-hydra-launch train_mnist.py base +hours=1 additional_parameters.account=yyy@gpu +project=project_name
-```
+- [jz-hydra-submitit-launcher](https://github.com/zaccharieramzi/jz-hydra-submitit-launcher) is a pip-installable (`pip install jz-hydra-submitit-launcher`) custom launcher that has the correct defaults for JZ, plus several default configurations:
+```
+hydra-submitit-launch train_mnist.py dev
+```

From b0903c3eb85c897ca57915fc0652b2fc6a29acda Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi
Date: Mon, 10 Jan 2022 15:01:31 +0100
Subject: [PATCH 22/24] added spec that my plugin needs to specify wandb mode
 to offline

---
 docs/examples/tf/tf_wandb_hydra/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index f46d9ad..3088f68 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -66,7 +66,7 @@ python train_mnist.py --multirun hydra/launcher=base +hours=1 fit.batch_size=32,
 - [slurm-hydra-submitit](https://github.com/RaphaelMeudec/slurm-hydra-submitit) presents a similar concept in a more general case for any SLURM cluster, without W&B. In particular, it specifies [how to run a grid search over specific parameter combinations](https://github.com/RaphaelMeudec/slurm-hydra-submitit#specific-parameters-combinations).
 - [jz-hydra-submitit-launcher](https://github.com/zaccharieramzi/jz-hydra-submitit-launcher) is a pip-installable (`pip install jz-hydra-submitit-launcher`) custom launcher that has the correct defaults for JZ, plus several default configurations:
 ```
-hydra-submitit-launch train_mnist.py dev
+hydra-submitit-launch train_mnist.py dev hydra.launcher.setup=\["'#SBATCH -C v100-32g'","'export WANDB_MODE=offline'"\]
 ```

From fc382164b71457496459bcbc601daf1162b5f853 Mon Sep 17 00:00:00 2001
From: zaccharieramzi
Date: Tue, 11 Jan 2022 16:56:35 +0100
Subject: [PATCH 23/24] specified that wandb requires an account

---
 docs/examples/tf/tf_wandb_hydra/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/examples/tf/tf_wandb_hydra/README.md b/docs/examples/tf/tf_wandb_hydra/README.md
index 3088f68..f45e3b0 100644
--- a/docs/examples/tf/tf_wandb_hydra/README.md
+++ b/docs/examples/tf/tf_wandb_hydra/README.md
@@ -40,6 +40,7 @@ python train_mnist.py --multirun hydra/launcher=base +hours=10 hydra.launcher.qo
 ### Weights&Biases
+This will require you to create a [Weights&Biases account](https://wandb.ai/).
 `wandb` is run offline because the compute nodes are not connected to the internet.
 In order to have the results uploaded to the cloud, you need to manually sync them using the `wandb sync run_dir` command.
 The run directories are located in `$SCRATCH/wandb/jean-zay-doc`, but this can be changed using the `wandb.dir` config variable.
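For instance, to push all offline runs of this example to the cloud in one go, a loop along these lines can be run from a front node (a minimal sketch, assuming the default `wandb.dir` above and wandb's usual `offline-run-*` naming of offline run directories):
```
# Sync every offline run directory to the W&B cloud.
# Run this from a front node: the compute nodes have no internet access.
for run_dir in $SCRATCH/wandb/jean-zay-doc/wandb/offline-run-*; do
    wandb sync "$run_dir"
done
```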
From 750322a06123a513da43766ee38da8409be368a5 Mon Sep 17 00:00:00 2001 From: zaccharieramzi Date: Tue, 11 Jan 2022 16:58:21 +0100 Subject: [PATCH 24/24] added the example to the page tree --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 266761d..41f3fdf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -73,3 +73,4 @@ nav: - Single node: examples/tf/tf_simple/README.md - Distributed with SlurmClusterResolver: examples/tf/tf_distributed/README.md - Distributed with Horovod: examples/tf/tf_mpi/README.md + - Weights&Biases and Hydra: examples/tf/tf_wandb_hydra/README.md