diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 54a3115..988ed8b 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -688,7 +688,9 @@ def _custom_param_grid_builder(
     return new_params
 
 
-def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]:
+def _choose_randomized_parameters(
+    rng: random.Random, model_parameters: dict[str, Any]
+) -> dict[str, Any]:
     """
     Choose a randomized setting of parameters from the given specification.
     """
@@ -697,7 +699,7 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
     for key, value in model_parameters.items():
         # If it's a Sequence (usually list) but not a string, choose one of the values at random.
         if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
-            parameter_choices[key] = random.choice(value)
+            parameter_choices[key] = rng.choice(value)
         # If it's a Mapping (usually dict), it defines a distribution from which
         # the parameter should be sampled.
         elif isinstance(value, collections.abc.Mapping):
@@ -706,9 +708,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
             high = value["high"]
 
             if distribution == "randint":
-                parameter_choices[key] = random.randint(low, high)
+                parameter_choices[key] = rng.randint(low, high)
             elif distribution == "uniform":
-                parameter_choices[key] = random.uniform(low, high)
+                parameter_choices[key] = rng.uniform(low, high)
             else:
                 raise ValueError("unknown distribution")
         # All other types (including strings) are passed through unchanged.
@@ -737,6 +739,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
 
     model_parameters = training_config["model_parameters"]
     model_parameter_search = training_config.get("model_parameter_search")
+    seed = training_config.get("seed")
     use_param_grid = training_config.get("param_grid", False)
 
     if model_parameters == []:
@@ -752,17 +755,18 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
         return _custom_param_grid_builder(model_parameters)
     elif strategy == "randomized":
         num_samples = model_parameter_search["num_samples"]
+        rng = random.Random(seed)
         return_parameters = []
         for _ in range(num_samples):
-            parameter_spec = random.choice(model_parameters)
+            parameter_spec = rng.choice(model_parameters)
 
             model_type = parameter_spec["type"]
             sample_parameters = dict(
                 (key, value)
                 for (key, value) in parameter_spec.items()
                 if key != "type"
             )
 
-            randomized = _choose_randomized_parameters(sample_parameters)
+            randomized = _choose_randomized_parameters(rng, sample_parameters)
             randomized["type"] = model_type
             return_parameters.append(randomized)
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 33ee240..3af04da 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -2,6 +2,7 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 # https://github.com/ipums/hlink
+from collections import Counter
 
 import pytest
 import pandas as pd
@@ -431,6 +432,61 @@ def test_get_model_parameters_search_strategy_randomized_take_values(training_co
     assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}
 
 
+def test_get_model_parameters_search_strategy_randomized_multiple_models(training_conf):
+    """
+    When there are multiple models for the "randomized" strategy, it randomly
+    samples the model before sampling the parameters for that model. Setting
+    the training.seed attribute lets us assert more precisely the counts for
+    each model type.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 100,
+    }
+    training_conf["training"]["seed"] = 101
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "minInfoGain": {"distribution": "uniform", "low": 0.1, "high": 0.9},
+        },
+        {"type": "probit"},
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    counter = Counter(parameter_choice["type"] for parameter_choice in model_parameters)
+    assert counter["random_forest"] == 47
+    assert counter["probit"] == 53
+
+
+def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf):
+    """
+    The "randomized" strategy uses training.seed to allow reproducible runs.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 5,
+    }
+    training_conf["training"]["seed"] = 35830969
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": {"distribution": "randint", "low": 1, "high": 10},
+            "numTrees": [1, 10, 100, 1000],
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    assert model_parameters == [
+        {"type": "random_forest", "maxDepth": 8, "numTrees": 100},
+        {"type": "random_forest", "maxDepth": 2, "numTrees": 1},
+        {"type": "random_forest", "maxDepth": 4, "numTrees": 100},
+        {"type": "random_forest", "maxDepth": 9, "numTrees": 10},
+        {"type": "random_forest", "maxDepth": 7, "numTrees": 100},
+    ]
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------