Skip to content

Commit

Permalink
[#167] Respect training.seed when the search strategy is "randomized"
Browse files Browse the repository at this point in the history
  • Loading branch information
riley-harper committed Dec 2, 2024
1 parent 1692c87 commit 0becd32
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 6 deletions.
16 changes: 10 additions & 6 deletions hlink/linking/model_exploration/link_step_train_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,9 @@ def _custom_param_grid_builder(
return new_params


def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]:
def _choose_randomized_parameters(
rng: random.Random, model_parameters: dict[str, Any]
) -> dict[str, Any]:
"""
Choose a randomized setting of parameters from the given specification.
"""
Expand All @@ -697,7 +699,7 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
for key, value in model_parameters.items():
# If it's a Sequence (usually list) but not a string, choose one of the values at random.
if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
parameter_choices[key] = random.choice(value)
parameter_choices[key] = rng.choice(value)
# If it's a Mapping (usually dict), it defines a distribution from which
# the parameter should be sampled.
elif isinstance(value, collections.abc.Mapping):
Expand All @@ -706,9 +708,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
high = value["high"]

if distribution == "randint":
parameter_choices[key] = random.randint(low, high)
parameter_choices[key] = rng.randint(low, high)
elif distribution == "uniform":
parameter_choices[key] = random.uniform(low, high)
parameter_choices[key] = rng.uniform(low, high)
else:
raise ValueError("unknown distribution")
# All other types (including strings) are passed through unchanged.
Expand Down Expand Up @@ -737,6 +739,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any

model_parameters = training_config["model_parameters"]
model_parameter_search = training_config.get("model_parameter_search")
seed = training_config.get("seed")
use_param_grid = training_config.get("param_grid", False)

if model_parameters == []:
Expand All @@ -752,17 +755,18 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
return _custom_param_grid_builder(model_parameters)
elif strategy == "randomized":
num_samples = model_parameter_search["num_samples"]
rng = random.Random(seed)

return_parameters = []
for _ in range(num_samples):
parameter_spec = random.choice(model_parameters)
parameter_spec = rng.choice(model_parameters)
model_type = parameter_spec["type"]
sample_parameters = dict(
(key, value)
for (key, value) in parameter_spec.items()
if key != "type"
)
randomized = _choose_randomized_parameters(sample_parameters)
randomized = _choose_randomized_parameters(rng, sample_parameters)
randomized["type"] = model_type
return_parameters.append(randomized)

Expand Down
56 changes: 56 additions & 0 deletions hlink/tests/model_exploration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink
from collections import Counter

import pytest
import pandas as pd
Expand Down Expand Up @@ -431,6 +432,61 @@ def test_get_model_parameters_search_strategy_randomized_take_values(training_co
assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}


def test_get_model_parameters_search_strategy_randomized_multiple_models(training_conf):
    """
    With several entries in model_parameters, the "randomized" strategy first
    picks one of the model specifications at random and then samples that
    model's parameters. Pinning training.seed makes the draw reproducible, so
    the exact number of samples per model type can be asserted.
    """
    training = training_conf["training"]
    training["model_parameter_search"] = {
        "strategy": "randomized",
        "num_samples": 100,
    }
    training["seed"] = 101

    random_forest_spec = {
        "type": "random_forest",
        "minInfoGain": {"distribution": "uniform", "low": 0.1, "high": 0.9},
    }
    probit_spec = {"type": "probit"}
    training["model_parameters"] = [random_forest_spec, probit_spec]

    sampled = _get_model_parameters(training_conf["training"])

    # Tally how many of the 100 samples landed on each model type.
    sampled_types = [choice["type"] for choice in sampled]
    type_counts = Counter(sampled_types)
    assert type_counts["random_forest"] == 47
    assert type_counts["probit"] == 53


def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf):
    """
    Setting training.seed makes the "randomized" strategy reproducible: the
    same seed yields the same sequence of sampled parameter settings.
    """
    training = training_conf["training"]
    training["model_parameter_search"] = {
        "strategy": "randomized",
        "num_samples": 5,
    }
    training["seed"] = 35830969
    training["model_parameters"] = [
        {
            "type": "random_forest",
            "maxDepth": {"distribution": "randint", "low": 1, "high": 10},
            "numTrees": [1, 10, 100, 1000],
        }
    ]

    sampled = _get_model_parameters(training_conf["training"])

    # (maxDepth, numTrees) pairs expected for this seed, in sampling order.
    expected_pairs = [(8, 100), (2, 1), (4, 100), (9, 10), (7, 100)]
    expected = [
        {"type": "random_forest", "maxDepth": max_depth, "numTrees": num_trees}
        for (max_depth, num_trees) in expected_pairs
    ]
    assert sampled == expected


# -------------------------------------
# Tests that probably should be moved
# -------------------------------------
Expand Down

0 comments on commit 0becd32

Please sign in to comment.