Skip to content

Commit

Permalink
Merge pull request #168 from ipums/randomized_parameter_search
Browse files Browse the repository at this point in the history
Add Randomized Parameter Search
  • Loading branch information
riley-harper authored Dec 4, 2024
2 parents 11bdfd4 + 73e6adc commit 85802d3
Show file tree
Hide file tree
Showing 2 changed files with 570 additions and 51 deletions.
191 changes: 148 additions & 43 deletions hlink/linking/model_exploration/link_step_train_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

import statistics
import collections.abc
import itertools
import logging
import math
import random
import re
import statistics
import sys
from textwrap import dedent
from time import perf_counter
from dataclasses import dataclass
from typing import Any
Expand Down Expand Up @@ -492,7 +496,7 @@ def _run(self) -> None:
)
# Explode params into all the combinations we want to test with the current model.
# This may use a grid search or a random search or exactly the parameters in the config.
model_parameters = self._get_model_parameters(config)
model_parameters = _get_model_parameters(training_settings)

outer_training_data = self._combine_folds(
outer_folds, ignore=test_data_index
Expand Down Expand Up @@ -632,35 +636,6 @@ def _get_splits(
)
return splits

def _custom_param_grid_builder(self, conf: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Expand the configured model_parameters into a grid of parameter settings.

    The model parameters are read from conf[section]["model_parameters"],
    where section is the string form of self.task.training_conf. Each entry is
    expanded into the Cartesian product of its hyper-parameter values.

    The "type", "threshold", and "threshold_ratio" keys are never expanded.
    They are copied unchanged onto every generated setting; a falsy or missing
    threshold/threshold_ratio is left off the result.

    A hyper-parameter given as a string or as a non-iterable scalar is treated
    as a single fixed choice. Previously a string was split into its
    characters and a scalar raised TypeError inside itertools.product.

    Returns a flat list with one dictionary per generated setting. The input
    entries are not modified.
    """
    print("Building param grid for models")
    given_parameters = conf[f"{self.task.training_conf}"]["model_parameters"]
    new_params = []
    for run in given_parameters:
        params = run.copy()
        model_type = params.pop("type")

        # dropping thresholds to prep for scikitlearn model exploration refactor
        threshold = params.pop("threshold", False)
        threshold_ratio = params.pop("threshold_ratio", False)

        keys = list(params)
        # Wrap strings and non-iterables so that product() sees exactly one
        # choice for them instead of iterating characters or failing.
        choices = [
            [value]
            if isinstance(value, str)
            or not isinstance(value, collections.abc.Iterable)
            else value
            for value in params.values()
        ]

        for prod in itertools.product(*choices):
            subdict = dict(zip(keys, prod))
            subdict["type"] = model_type
            if threshold:
                subdict["threshold"] = threshold
            if threshold_ratio:
                subdict["threshold_ratio"] = threshold_ratio
            new_params.append(subdict)
    return new_params

def _capture_results(
self,
predictions: pyspark.sql.DataFrame,
Expand Down Expand Up @@ -721,18 +696,6 @@ def _capture_results(
)
return pd.concat([results_df, new_results], ignore_index=True)

def _get_model_parameters(self, conf: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Return the list of model parameter settings to evaluate.

    The training section of conf is looked up under the string form of
    self.task.training_conf. When that section has a truthy "param_grid"
    entry, the result of self._custom_param_grid_builder(conf) is returned.
    Otherwise the section's "model_parameters" list is returned as-is.

    Raises ValueError when the grid is not requested and "model_parameters"
    is an empty list.
    """
    section = conf[str(self.task.training_conf)]
    explicit_parameters = section["model_parameters"]

    grid_requested = "param_grid" in section and section["param_grid"]
    if grid_requested:
        return self._custom_param_grid_builder(conf)

    if explicit_parameters == []:
        raise ValueError(
            "No model parameters found. In 'training' config, either supply 'model_parameters' or 'param_grid'."
        )
    return explicit_parameters

def _save_training_results(
self, desc_df: pd.DataFrame, spark: pyspark.sql.SparkSession
) -> None:
Expand Down Expand Up @@ -1119,3 +1082,145 @@ def _create_thresholded_metrics_df() -> pd.DataFrame:
"mcc_train_sd",
]
)


def _grid_choices(value: Any) -> collections.abc.Iterable[Any]:
    """
    Return the candidate values that one hyper-parameter contributes to the grid.

    Strings and non-iterable values are wrapped in a one-element list so that
    they act as a single fixed choice. Any other iterable is returned as-is.
    """
    if isinstance(value, str) or not isinstance(value, collections.abc.Iterable):
        return [value]
    return value


def _custom_param_grid_builder(
    model_parameters: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """
    Expand each entry of model_parameters into a grid of parameter settings.

    Every entry must contain a "type" key. For each entry, the Cartesian
    product of its hyper-parameter values is generated, one output dictionary
    per combination.

    The "type", "threshold", and "threshold_ratio" keys are never expanded.
    They are copied unchanged onto every generated setting; a falsy or missing
    threshold/threshold_ratio is left off the result.

    A hyper-parameter given as a string or as a non-iterable scalar is treated
    as a single fixed choice. Previously a string was split into its
    characters and a scalar raised TypeError inside itertools.product.

    Returns a flat list of all generated settings, in entry order. The input
    dictionaries are not modified.
    """
    print("Building param grid for models")
    new_params = []
    for run in model_parameters:
        params = run.copy()
        model_type = params.pop("type")

        # dropping thresholds to prep for scikitlearn model exploration refactor
        threshold = params.pop("threshold", False)
        threshold_ratio = params.pop("threshold_ratio", False)

        keys = list(params)
        choices = [_grid_choices(value) for value in params.values()]

        for prod in itertools.product(*choices):
            subdict = dict(zip(keys, prod))
            subdict["type"] = model_type
            if threshold:
                subdict["threshold"] = threshold
            if threshold_ratio:
                subdict["threshold_ratio"] = threshold_ratio
            new_params.append(subdict)
    return new_params


def _choose_randomized_parameters(
    rng: random.Random, model_parameters: dict[str, Any]
) -> dict[str, Any]:
    """
    Draw one concrete value for every parameter in the given specification.

    Each value in model_parameters is interpreted by its type:

    - a non-string Sequence: one element is picked with rng.choice;
    - a Mapping: it must have a "distribution" key naming "randint" or
      "uniform" (both read "low" and "high") or "normal" (reads "mean" and
      "standard_deviation"), and the value is sampled from rng accordingly;
    - anything else, including strings: copied through unchanged.

    Raises ValueError when a Mapping names an unknown distribution.
    """

    def draw(spec: collections.abc.Mapping) -> Any:
        # Sample a single value from the distribution described by spec.
        kind = spec["distribution"]

        if kind == "randint":
            lower = spec["low"]
            upper = spec["high"]
            return rng.randint(lower, upper)
        if kind == "uniform":
            lower = spec["low"]
            upper = spec["high"]
            return rng.uniform(lower, upper)
        if kind == "normal":
            center = spec["mean"]
            spread = spec["standard_deviation"]
            return rng.normalvariate(center, spread)

        raise ValueError(
            f"Unknown distribution '{kind}'. Please choose one of 'randint', 'uniform', or 'normal'."
        )

    chosen = {}
    for name, spec in model_parameters.items():
        is_text = isinstance(spec, str)
        if not is_text and isinstance(spec, collections.abc.Sequence):
            # A list of candidate values: pick one of them.
            chosen[name] = rng.choice(spec)
        elif isinstance(spec, collections.abc.Mapping):
            # A table describing a distribution to sample from.
            chosen[name] = draw(spec)
        else:
            # Fixed value, nothing to randomize.
            chosen[name] = spec

    return chosen


def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Build the list of model parameter settings described by training_settings.

    Keys read from training_settings:

    - "model_parameters" (required): the list of parameter specifications.
    - "model_parameter_search" (optional): a table whose "strategy" is
      "explicit" (return model_parameters unchanged), "grid" (expand with
      _custom_param_grid_builder), or "randomized" (draw "num_samples"
      settings with _choose_randomized_parameters).
    - "seed" (optional): seeds the random number generator used by the
      randomized strategy.
    - "param_grid" (optional, deprecated): consulted only when
      "model_parameter_search" is absent; a truthy value selects the grid
      expansion. Its presence prints a deprecation warning to stderr.

    Raises ValueError when model_parameters is an empty list or when the
    strategy is not one of the three known names.
    """
    if "param_grid" in training_settings:
        deprecation_notice = dedent(
            """\
            Deprecation Warning: training.param_grid is deprecated.
            Please use training.model_parameter_search instead by replacing
            `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or
            `param_grid = False` with `model_parameter_search = {strategy = "explicit"}`
            [deprecated_in_version=4.0.0]"""
        )
        print(deprecation_notice, file=sys.stderr)

    candidates = training_settings["model_parameters"]
    if candidates == []:
        raise ValueError(
            "model_parameters is empty, so there are no models to evaluate"
        )

    search = training_settings.get("model_parameter_search")
    if search is None:
        # Legacy path: fall back to the deprecated param_grid switch.
        if training_settings.get("param_grid", False):
            return _custom_param_grid_builder(candidates)
        return candidates

    strategy = search["strategy"]
    if strategy == "explicit":
        return candidates
    if strategy == "grid":
        return _custom_param_grid_builder(candidates)
    if strategy != "randomized":
        raise ValueError(
            f"Unknown model_parameter_search strategy '{strategy}'. "
            "Please choose one of 'explicit', 'grid', or 'randomized'."
        )

    num_samples = search["num_samples"]
    rng = random.Random(training_settings.get("seed"))

    # These keys are special and are copied as-is; every other key is a
    # hyper-parameter of the model and gets sampled.
    frozen_keys = {"type", "threshold", "threshold_ratio"}

    samples = []
    for _ in range(num_samples):
        spec = rng.choice(candidates)

        frozen = {}
        to_sample = {}
        for key, value in spec.items():
            if key in frozen_keys:
                frozen[key] = value
            else:
                to_sample[key] = value

        samples.append({**frozen, **_choose_randomized_parameters(rng, to_sample)})

    return samples
Loading

0 comments on commit 85802d3

Please sign in to comment.