From 048ba19a0965d595c286c4549ecaa5b6aca5848b Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Wed, 9 Nov 2022 13:13:37 -0500 Subject: [PATCH 1/6] Tweak example to be resumable --- example/config.yaml | 9 ++- example/my_app.py | 55 ++++++++++++++++++- hydra_plugins/hydra_orion_sweeper/config.py | 2 + .../hydra_orion_sweeper/implementation.py | 17 +++++- 4 files changed, 77 insertions(+), 6 deletions(-) diff --git a/example/config.yaml b/example/config.yaml index 5f2f31c..cf7446a 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -2,6 +2,11 @@ defaults: - override hydra/sweeper: orion hydra: + job: + env_set: + PREVIOUS_CHECKPOINT: ${hydra.sweeper.orion.previous_checkpoint} + CURRENT_CHECKPOINT: ${hydra.runtime.output_dir} + # makes sure each multirun ends up in a unique folder # the defaults can make overlapping folders sweep: @@ -16,14 +21,14 @@ hydra: lr: "uniform(0, 1)" dropout: "uniform(0, 1)" batch_size: "uniform(4, 16, discrete=True)" - epoch: "fidelity(10, 100)" + epoch: "fidelity(3, 100)" orion: name: 'experiment' version: '1' algorithm: - type: random + type: hyperband config: seed: 1 diff --git a/example/my_app.py b/example/my_app.py index 11ab5a3..387e828 100644 --- a/example/my_app.py +++ b/example/my_app.py @@ -8,6 +8,47 @@ log = logging.getLogger(__name__) + +def _load_checkpoint(path, model): + checkpoint = os.path.join(path, 'chk.pt') + + if os.path.exists(checkpoint): + # load checkpoint + # ... + return True + + return False + + +def load_checkpoint(model): + current_checkpoint_path = os.getenv("CURRENT_CHECKPOINT") + assert current_checkpoint_path is not None + + # if checkpoint file exist then always load it as it is the most recent + if _load_checkpoint(current_checkpoint_path, model): + return True + + # Previous checkpoint points to a job that finished and that we want to resume from + # this is useful for genetic algo or algo that gradually improve on previous solutions + prev_checkpoint_path = os.getenv("PREVIOUS_CHECKPOINT") + + if prev_checkpoint_path and _load_checkpoint(prev_checkpoint_path, model): + return True + + return False + + + +def save_checkpoint(model): + current_checkpoint_path = os.getenv("CURRENT_CHECKPOINT") + checkpoint = os.path.join(current_checkpoint_path, 'chk.pt') + + with open(checkpoint, 'w') as fp: + # save checkpoint + # ... + pass + + @hydra.main(config_path=".", config_name="config", version_base="1.1") def dummy_training(cfg: DictConfig) -> float: """A dummy function to minimize @@ -15,19 +56,29 @@ def dummy_training(cfg: DictConfig) -> float: lr = 0.12, dropout=0.33, opt=Adam, batch_size=4 """ + # print(cfg.hydra ) + # makes sure folders are unique os.makedirs('newdir', exist_ok=False) + model = None + + if load_checkpoint(model): + print('Resuming from checkpoint') + else: + print('No checkpoint found') + do = cfg.dropout bs = cfg.batch_size out = float( abs(do - 0.33) + int(cfg.optimizer.name == "Adam") + abs(cfg.optimizer.lr - 0.12) + abs(bs - 4) ) - # ..../hydra_orion_sweeper/example/multirun/2022-11-08/11-56-45/39 - # print(os.getcwd()) log.info( f"dummy_training(dropout={do:.3f}, lr={cfg.optimizer.lr:.3f}, opt={cfg.optimizer.name}, batch_size={bs}) = {out:.3f}", ) + + save_checkpoint(model) + if cfg.error: raise RuntimeError("cfg.error is True") diff --git a/hydra_plugins/hydra_orion_sweeper/config.py b/hydra_plugins/hydra_orion_sweeper/config.py index 4995a50..0ecfd5f 100644 --- a/hydra_plugins/hydra_orion_sweeper/config.py +++ b/hydra_plugins/hydra_orion_sweeper/config.py @@ -23,6 +23,8 @@ class OrionClientConf: trial: Optional[str] = None uuid: Optional[str] = None + previous_checkpoint: Optional[str] = None + @dataclass class WorkerConf: diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index 1959afb..2469ba3 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -13,6 +13,7 @@ from typing import Any, List, Optional, Sequence, Union from hydra.core import utils +from hydra.core.global_hydra import GlobalHydra from hydra.core.override_parser.overrides_parser import OverridesParser from hydra.core.override_parser.types import Override, QuotedString from hydra.core.plugins import Plugins @@ -159,16 +160,19 @@ def override_parser(): return parser -def as_overrides(trial, additional, uuid): +def as_overrides(trial, additional, uuid, prev_checkpoint): """Returns the trial arguments as hydra overrides""" kwargs = deepcopy(additional) kwargs.update(flatten(trial.params)) args = [f"{k}={v}" for k, v in kwargs.items()] + args += [ f"hydra.sweeper.orion.id={trial.experiment}", f"hydra.sweeper.orion.trial={trial.id}", f"hydra.sweeper.orion.uuid={uuid}", + f"hydra.sweeper.orion.previous_checkpoint={prev_checkpoint}", + # "hydra.sweeper.orion.current_checkpoint=$hydra.runtime.output_dir", ] return tuple(args) @@ -350,6 +354,7 @@ def __init__( self.client = None self.storage = None self.uuid = uuid.uuid1().hex + self.resume_paths = dict() self.orion_config = orion self.worker_config = worker @@ -532,10 +537,15 @@ def sample_trials(self) -> List[Trial]: self.pending_trials.update(set(trials)) return trials + def trial_as_override(self, trial: Trial): + """Create overrides for a specific trial""" + checkpoint = self.resume_paths.get(trial.hash_params) + return as_overrides(trial, self.arguments, self.uuid, checkpoint) + def execute_trials(self, trials: List[Trial]) -> Sequence[JobReturn]: """Execture the given batch of trials""" - overrides = list(as_overrides(t, self.arguments, self.uuid) for t in trials) + overrides = list(self.trial_as_override(t) for t in trials) self.validate_batch_is_legal(overrides) returns = self.launcher.launch(overrides, initial_job_idx=self.job_idx) @@ -548,6 +558,9 @@ def observe_one( """Observe a single trial""" value = result.return_value + trialdir = result.hydra_cfg["hydra"]["runtime"]["output_dir"] + self.resume_paths[trial.hash_params] = trialdir + try: objective = to_objective(value) self.client.observe(trial, objective) From fe93ec8575782ffd5b1cc837d6ca691ec5869446 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Wed, 9 Nov 2022 13:16:07 -0500 Subject: [PATCH 2/6] cleanup --- example/my_app.py | 2 -- hydra_plugins/hydra_orion_sweeper/implementation.py | 1 - 2 files changed, 3 deletions(-) diff --git a/example/my_app.py b/example/my_app.py index 387e828..ff68b07 100644 --- a/example/my_app.py +++ b/example/my_app.py @@ -56,8 +56,6 @@ def dummy_training(cfg: DictConfig) -> float: lr = 0.12, dropout=0.33, opt=Adam, batch_size=4 """ - # print(cfg.hydra ) - # makes sure folders are unique os.makedirs('newdir', exist_ok=False) diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index 2469ba3..1c9ace7 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -13,7 +13,6 @@ from typing import Any, List, Optional, Sequence, Union from hydra.core import utils -from hydra.core.global_hydra import GlobalHydra from hydra.core.override_parser.overrides_parser import OverridesParser from hydra.core.override_parser.types import Override, QuotedString from hydra.core.plugins import Plugins From 0c96fde6755a698bea11c75a021c626ef9cbe28d Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Wed, 9 Nov 2022 13:26:00 -0500 Subject: [PATCH 3/6] - --- hydra_plugins/hydra_orion_sweeper/implementation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index 1c9ace7..f687730 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -557,8 +557,11 @@ def observe_one( """Observe a single trial""" value = result.return_value - trialdir = result.hydra_cfg["hydra"]["runtime"]["output_dir"] - self.resume_paths[trial.hash_params] = trialdir + trialdir = ( + result.hydra_cfg.get("hydra", {}).get("runtime", {}).get("output_dir", None) + ) + if trialdir: + self.resume_paths[trial.hash_params] = trialdir try: objective = to_objective(value) From 3ec1052cb48c0c88177ad23c1172654a7091cc41 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Wed, 9 Nov 2022 15:07:43 -0500 Subject: [PATCH 4/6] - --- hydra_plugins/hydra_orion_sweeper/implementation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index f687730..7f365a3 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -557,10 +557,8 @@ def observe_one( """Observe a single trial""" value = result.return_value - trialdir = ( - result.hydra_cfg.get("hydra", {}).get("runtime", {}).get("output_dir", None) - ) - if trialdir: + if result.hydra_cfg: + trialdir = result.hydra_cfg["hydra"]["runtime"]["output_dir"] self.resume_paths[trial.hash_params] = trialdir try: From b8d0aac41816a4154b646fb186bea00ca1de7692 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Fri, 6 Jan 2023 13:31:18 -0500 Subject: [PATCH 5/6] Fix name clash --- example/config.yaml | 6 +++--- hydra_plugins/hydra_orion_sweeper/config.py | 2 +- .../hydra_orion_sweeper/implementation.py | 14 +++++++------- hydra_plugins/hydra_orion_sweeper/orion_sweeper.py | 4 ++-- tests/hydra_config.py | 2 +- tests/test_orion.py | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/example/config.yaml b/example/config.yaml index cf7446a..f61a15d 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -4,14 +4,14 @@ defaults: hydra: job: env_set: - PREVIOUS_CHECKPOINT: ${hydra.sweeper.orion.previous_checkpoint} + PREVIOUS_CHECKPOINT: ${hydra.sweeper.client.previous_checkpoint} CURRENT_CHECKPOINT: ${hydra.runtime.output_dir} # makes sure each multirun ends up in a unique folder # the defaults can make overlapping folders sweep: dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.sweeper.orion.name}/${hydra.sweeper.orion.uuid}/${hydra.job.id} + subdir: ${hydra.sweeper.client.name}/${hydra.sweeper.client.uuid}/${hydra.job.id} sweeper: # default parametrization of the search space @@ -23,7 +23,7 @@ hydra: batch_size: "uniform(4, 16, discrete=True)" epoch: "fidelity(3, 100)" - orion: + client: name: 'experiment' version: '1' diff --git a/hydra_plugins/hydra_orion_sweeper/config.py b/hydra_plugins/hydra_orion_sweeper/config.py index 0ecfd5f..ae30529 100644 --- a/hydra_plugins/hydra_orion_sweeper/config.py +++ b/hydra_plugins/hydra_orion_sweeper/config.py @@ -84,7 +84,7 @@ class OrionSweeperConf: _target_: str = "hydra_plugins.hydra_orion_sweeper.orion_sweeper.OrionSweeper" - orion: OrionClientConf = OrionClientConf() + client: OrionClientConf = OrionClientConf() worker: WorkerConf = WorkerConf() diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index 7f365a3..29d342a 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -167,11 +167,11 @@ def as_overrides(trial, additional, uuid, prev_checkpoint): args = [f"{k}={v}" for k, v in kwargs.items()] args += [ - f"hydra.sweeper.orion.id={trial.experiment}", - f"hydra.sweeper.orion.trial={trial.id}", - f"hydra.sweeper.orion.uuid={uuid}", - f"hydra.sweeper.orion.previous_checkpoint={prev_checkpoint}", - # "hydra.sweeper.orion.current_checkpoint=$hydra.runtime.output_dir", + f"hydra.sweeper.client.id={trial.experiment}", + f"hydra.sweeper.client.trial={trial.id}", + f"hydra.sweeper.client.uuid={uuid}", + f"hydra.sweeper.client.previous_checkpoint={prev_checkpoint}", + # "hydra.sweeper.client.current_checkpoint=$hydra.runtime.output_dir", ] return tuple(args) @@ -341,7 +341,7 @@ class OrionSweeperImpl(Sweeper): def __init__( self, - orion: OrionClientConf, + client: OrionClientConf, worker: WorkerConf, algorithm: AlgorithmConf, storage: StorageConf, @@ -355,7 +355,7 @@ def __init__( self.uuid = uuid.uuid1().hex self.resume_paths = dict() - self.orion_config = orion + self.orion_config = client self.worker_config = worker self.algo_config = algorithm self.storage_config = storage diff --git a/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py b/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py index 8fdd1d7..767c27b 100644 --- a/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py +++ b/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py @@ -18,7 +18,7 @@ class OrionSweeper(Sweeper): def __init__( self, - orion: OrionClientConf, + client: OrionClientConf, worker: WorkerConf, algorithm: AlgorithmConf, storage: StorageConf, @@ -47,7 +47,7 @@ def __init__( if params is None: params = dict() - self.sweeper = OrionSweeperImpl(orion, worker, algorithm, storage, params) + self.sweeper = OrionSweeperImpl(client, worker, algorithm, storage, params) def setup( self, diff --git a/tests/hydra_config.py b/tests/hydra_config.py index 19bdb17..fa9c0e5 100644 --- a/tests/hydra_config.py +++ b/tests/hydra_config.py @@ -7,7 +7,7 @@ }, "sweeper": { "_target_": "hydra_plugins.hydra_orion_sweeper.orion_sweeper.OrionSweeper", - "orion": { + "client": { "name": None, "version": None, "branching": None, diff --git a/tests/test_orion.py b/tests/test_orion.py index 2b60a4e..20b1bc1 100644 --- a/tests/test_orion.py +++ b/tests/test_orion.py @@ -42,7 +42,7 @@ def load_hydra_testing_config(): def orion_configuration(): return dict( - orion=OmegaConf.structured(OrionClientConf()), + client=OmegaConf.structured(OrionClientConf()), worker=OmegaConf.structured(WorkerConf()), algorithm=OmegaConf.structured(AlgorithmConf()), storage=OmegaConf.structured(StorageConf()), From c60e8d42c7bdf72315c94b0f9b54a6c9cd7805f5 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Fri, 6 Jan 2023 13:38:00 -0500 Subject: [PATCH 6/6] Rename client to experiment --- example/config.yaml | 6 +++--- hydra_plugins/hydra_orion_sweeper/config.py | 2 +- .../hydra_orion_sweeper/implementation.py | 14 +++++++------- hydra_plugins/hydra_orion_sweeper/orion_sweeper.py | 8 ++++---- tests/hydra_config.py | 2 +- tests/test_orion.py | 2 +- tests/test_warnings.py | 4 ++-- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/example/config.yaml b/example/config.yaml index f61a15d..8c204f4 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -4,14 +4,14 @@ defaults: hydra: job: env_set: - PREVIOUS_CHECKPOINT: ${hydra.sweeper.client.previous_checkpoint} + PREVIOUS_CHECKPOINT: ${hydra.sweeper.experiment.previous_checkpoint} CURRENT_CHECKPOINT: ${hydra.runtime.output_dir} # makes sure each multirun ends up in a unique folder # the defaults can make overlapping folders sweep: dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.sweeper.client.name}/${hydra.sweeper.client.uuid}/${hydra.job.id} + subdir: ${hydra.sweeper.experiment.name}/${hydra.sweeper.experiment.uuid}/${hydra.job.id} sweeper: # default parametrization of the search space @@ -23,7 +23,7 @@ hydra: batch_size: "uniform(4, 16, discrete=True)" epoch: "fidelity(3, 100)" - client: + experiment: name: 'experiment' version: '1' diff --git a/hydra_plugins/hydra_orion_sweeper/config.py b/hydra_plugins/hydra_orion_sweeper/config.py index ae30529..4ade603 100644 --- a/hydra_plugins/hydra_orion_sweeper/config.py +++ b/hydra_plugins/hydra_orion_sweeper/config.py @@ -84,7 +84,7 @@ class OrionSweeperConf: _target_: str = "hydra_plugins.hydra_orion_sweeper.orion_sweeper.OrionSweeper" - client: OrionClientConf = OrionClientConf() + experiment: OrionClientConf = OrionClientConf() worker: WorkerConf = WorkerConf() diff --git a/hydra_plugins/hydra_orion_sweeper/implementation.py b/hydra_plugins/hydra_orion_sweeper/implementation.py index 29d342a..20b2ade 100644 --- a/hydra_plugins/hydra_orion_sweeper/implementation.py +++ b/hydra_plugins/hydra_orion_sweeper/implementation.py @@ -167,11 +167,11 @@ def as_overrides(trial, additional, uuid, prev_checkpoint): args = [f"{k}={v}" for k, v in kwargs.items()] args += [ - f"hydra.sweeper.client.id={trial.experiment}", - f"hydra.sweeper.client.trial={trial.id}", - f"hydra.sweeper.client.uuid={uuid}", - f"hydra.sweeper.client.previous_checkpoint={prev_checkpoint}", - # "hydra.sweeper.client.current_checkpoint=$hydra.runtime.output_dir", + f"hydra.sweeper.experiment.id={trial.experiment}", + f"hydra.sweeper.experiment.trial={trial.id}", + f"hydra.sweeper.experiment.uuid={uuid}", + f"hydra.sweeper.experiment.previous_checkpoint={prev_checkpoint}", + # "hydra.sweeper.experiment.current_checkpoint=$hydra.runtime.output_dir", ] return tuple(args) @@ -341,7 +341,7 @@ class OrionSweeperImpl(Sweeper): def __init__( self, - client: OrionClientConf, + experiment: OrionClientConf, worker: WorkerConf, algorithm: AlgorithmConf, storage: StorageConf, @@ -355,7 +355,7 @@ def __init__( self.uuid = uuid.uuid1().hex self.resume_paths = dict() - self.orion_config = client + self.orion_config = experiment self.worker_config = worker self.algo_config = algorithm self.storage_config = storage diff --git a/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py b/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py index 767c27b..8e5fb2d 100644 --- a/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py +++ b/hydra_plugins/hydra_orion_sweeper/orion_sweeper.py @@ -18,7 +18,7 @@ class OrionSweeper(Sweeper): def __init__( self, - client: OrionClientConf, + experiment: OrionClientConf, worker: WorkerConf, algorithm: AlgorithmConf, storage: StorageConf, @@ -30,7 +30,7 @@ def __init__( # >>> Remove with Issue #8 if parametrization is not None and params is None: warn( - "`hydra.sweeper.orion.parametrization` is deprecated;" + "`hydra.sweeper.experiment.parametrization` is deprecated;" "use `hydra.sweeper.params` instead", DeprecationWarning, ) @@ -38,7 +38,7 @@ def __init__( elif parametrization is not None and params is not None: warn( - "Both `hydra.sweeper.orion.parametrization` and `hydra.sweeper.params` are defined;" + "Both `hydra.sweeper.experiment.parametrization` and `hydra.sweeper.params` are defined;" "using `hydra.sweeper.params`", DeprecationWarning, ) @@ -47,7 +47,7 @@ def __init__( if params is None: params = dict() - self.sweeper = OrionSweeperImpl(client, worker, algorithm, storage, params) + self.sweeper = OrionSweeperImpl(experiment, worker, algorithm, storage, params) def setup( self, diff --git a/tests/hydra_config.py b/tests/hydra_config.py index fa9c0e5..b0b620c 100644 --- a/tests/hydra_config.py +++ b/tests/hydra_config.py @@ -7,7 +7,7 @@ }, "sweeper": { "_target_": "hydra_plugins.hydra_orion_sweeper.orion_sweeper.OrionSweeper", - "client": { + "experiment": { "name": None, "version": None, "branching": None, diff --git a/tests/test_orion.py b/tests/test_orion.py index 20b1bc1..294f192 100644 --- a/tests/test_orion.py +++ b/tests/test_orion.py @@ -42,7 +42,7 @@ def load_hydra_testing_config(): def orion_configuration(): return dict( - client=OmegaConf.structured(OrionClientConf()), + experiment=OmegaConf.structured(OrionClientConf()), worker=OmegaConf.structured(WorkerConf()), algorithm=OmegaConf.structured(AlgorithmConf()), storage=OmegaConf.structured(StorageConf()), diff --git a/tests/test_warnings.py b/tests/test_warnings.py index df7407a..dfc39f6 100644 --- a/tests/test_warnings.py +++ b/tests/test_warnings.py @@ -25,7 +25,7 @@ def test_parametrization_is_deprecated(): assert ( warnings[0] .message.args[0] - .startswith("`hydra.sweeper.orion.parametrization` is deprecated;") + .startswith("`hydra.sweeper.experiment.parametrization` is deprecated;") ) @@ -44,7 +44,7 @@ def test_parametrization_and_params(): assert ( warnings[0] .message.args[0] - .startswith("Both `hydra.sweeper.orion.parametrization` and") + .startswith("Both `hydra.sweeper.experiment.parametrization` and") )