[#167] Respect training.seed when the search strategy is "randomized"

ipums · Dec 2, 2024 · 0becd32 · 0becd32
1 parent 1692c87
commit 0becd32
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 6 deletions.
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -688,7 +688,9 @@ def _custom_param_grid_builder(
     return new_params
 
 
-def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str, Any]:
+def _choose_randomized_parameters(
+    rng: random.Random, model_parameters: dict[str, Any]
+) -> dict[str, Any]:
     """
     Choose a randomized setting of parameters from the given specification.
     """
@@ -697,7 +699,7 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
     for key, value in model_parameters.items():
         # If it's a Sequence (usually list) but not a string, choose one of the values at random.
         if isinstance(value, collections.abc.Sequence) and not isinstance(value, str):
-            parameter_choices[key] = random.choice(value)
+            parameter_choices[key] = rng.choice(value)
         # If it's a Mapping (usually dict), it defines a distribution from which
         # the parameter should be sampled.
         elif isinstance(value, collections.abc.Mapping):
@@ -706,9 +708,9 @@ def _choose_randomized_parameters(model_parameters: dict[str, Any]) -> dict[str,
             high = value["high"]
 
             if distribution == "randint":
-                parameter_choices[key] = random.randint(low, high)
+                parameter_choices[key] = rng.randint(low, high)
             elif distribution == "uniform":
-                parameter_choices[key] = random.uniform(low, high)
+                parameter_choices[key] = rng.uniform(low, high)
             else:
                 raise ValueError("unknown distribution")
         # All other types (including strings) are passed through unchanged.
@@ -737,6 +739,7 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
 
     model_parameters = training_config["model_parameters"]
     model_parameter_search = training_config.get("model_parameter_search")
+    seed = training_config.get("seed")
     use_param_grid = training_config.get("param_grid", False)
 
     if model_parameters == []:
@@ -752,17 +755,18 @@ def _get_model_parameters(training_config: dict[str, Any]) -> list[dict[str, Any
             return _custom_param_grid_builder(model_parameters)
         elif strategy == "randomized":
             num_samples = model_parameter_search["num_samples"]
+            rng = random.Random(seed)
 
             return_parameters = []
             for _ in range(num_samples):
-                parameter_spec = random.choice(model_parameters)
+                parameter_spec = rng.choice(model_parameters)
                 model_type = parameter_spec["type"]
                 sample_parameters = dict(
                     (key, value)
                     for (key, value) in parameter_spec.items()
                     if key != "type"
                 )
-                randomized = _choose_randomized_parameters(sample_parameters)
+                randomized = _choose_randomized_parameters(rng, sample_parameters)
                 randomized["type"] = model_type
                 return_parameters.append(randomized)
 

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
@@ -2,6 +2,7 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 #   https://github.com/ipums/hlink
+from collections import Counter
 
 import pytest
 import pandas as pd
@@ -431,6 +432,61 @@ def test_get_model_parameters_search_strategy_randomized_take_values(training_co
         assert parameter_choice["subsamplingRate"] in {0.5, 1.0, 1.5}
 
 
+def test_get_model_parameters_search_strategy_randomized_multiple_models(training_conf):
+    """
+    When there are multiple models for the "randomized" strategy, it randomly
+    samples the model before sampling the parameters for that model. Setting
+    the training.seed attribute lets us assert more precisely the counts for
+    each model type.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 100,
+    }
+    training_conf["training"]["seed"] = 101
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "minInfoGain": {"distribution": "uniform", "low": 0.1, "high": 0.9},
+        },
+        {"type": "probit"},
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    counter = Counter(parameter_choice["type"] for parameter_choice in model_parameters)
+    assert counter["random_forest"] == 47
+    assert counter["probit"] == 53
+
+
+def test_get_model_parameters_search_strategy_randomized_uses_seed(training_conf):
+    """
+    The "randomized" strategy uses training.seed to allow reproducible runs.
+    """
+    training_conf["training"]["model_parameter_search"] = {
+        "strategy": "randomized",
+        "num_samples": 5,
+    }
+    training_conf["training"]["seed"] = 35830969
+    training_conf["training"]["model_parameters"] = [
+        {
+            "type": "random_forest",
+            "maxDepth": {"distribution": "randint", "low": 1, "high": 10},
+            "numTrees": [1, 10, 100, 1000],
+        }
+    ]
+
+    model_parameters = _get_model_parameters(training_conf["training"])
+
+    assert model_parameters == [
+        {"type": "random_forest", "maxDepth": 8, "numTrees": 100},
+        {"type": "random_forest", "maxDepth": 2, "numTrees": 1},
+        {"type": "random_forest", "maxDepth": 4, "numTrees": 100},
+        {"type": "random_forest", "maxDepth": 9, "numTrees": 10},
+        {"type": "random_forest", "maxDepth": 7, "numTrees": 100},
+    ]
+
+
 # -------------------------------------
 # Tests that probably should be moved
 # -------------------------------------