Tree-structured Parzen Estimator for hyperparameter optimization #13

Draft
wants to merge 17 commits into base: master
276 changes: 269 additions & 7 deletions src/sweeps/bayes_search.py
@@ -57,6 +57,8 @@ def sigmoid(x: ArrayLike) -> ArrayLike:


def random_sample(X_bounds: ArrayLike, num_test_samples: integer) -> ArrayLike:
if hasattr(X_bounds, "tolist"):
X_bounds = X_bounds.tolist()
num_hyperparameters = len(X_bounds)
test_X = np.empty((int(num_test_samples), num_hyperparameters))
for ii in range(num_test_samples):
@@ -171,10 +173,12 @@ def next_sample(
X_bounds: Optional[ArrayLike] = None,
current_X: Optional[ArrayLike] = None,
nu: floating = 1.5,
max_samples_for_gp: integer = 100,
improvement: floating = 0.01,
max_samples_for_model: integer = 100,
improvement: floating = 0.1,
bw_multiplier=0.2,
num_points_to_try: integer = 1000,
opt_func: str = "expected_improvement",
model: str = "gp",
test_X: Optional[ArrayLike] = None,
) -> Tuple[ArrayLike, floating, floating, floating, floating]:
"""Calculates the best next sample to look at via bayesian optimization.
@@ -194,19 +198,23 @@ def next_sample(

http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html

max_samples_for_gp: integer, optional, default 100
max_samples_for_model: integer, optional, default 100
maximum number of samples to consider (the GP is O(n^3)); capping this helps performance
and also adds some randomness. This number of samples is chosen
randomly from sample_X and used to train the model.
improvement: floating, optional, default 0.1
amount of improvement to optimize for -- higher means take more exploratory risks
bw_multiplier: floating, optional, default 0.2
scaling factor for the kernel density estimation bandwidth used by the bayes-tpe-multi model
num_points_to_try: integer, optional, default 1000
number of X values to try when looking for value with highest expected probability
of improvement
opt_func: one of {"expected_improvement", "prob_of_improvement"} - whether to optimize expected
improvement or probability of improvement. Expected improvement is generally better - may want
to remove probability of improvement at some point. (But probability of improvement
is a little easier to calculate.)
model: one of {"bayes", "bayes-tpe", "bayes-tpe-multi"} - whether to use a Gaussian Process as the
surrogate model, a Tree-structured Parzen Estimator, or a multivariate TPE; values other than the
two TPE options fall back to the GP
test_X: X values to test when looking for the best values to try

Returns:
@@ -263,9 +271,61 @@ def next_sample(
np.nan,
)

if model == "bayes-tpe":
return next_sample_tpe(
filtered_X=filtered_X,
filtered_y=filtered_y,
X_bounds=X_bounds,
current_X=current_X,
max_samples_for_model=max_samples_for_model,
improvement=improvement,
num_points_to_try=num_points_to_try,
test_X=test_X,
multivariate=False,
)
elif model == "bayes-tpe-multi":
return next_sample_tpe(
filtered_X=filtered_X,
filtered_y=filtered_y,
X_bounds=X_bounds,
current_X=current_X,
max_samples_for_model=max_samples_for_model,
improvement=improvement,
num_points_to_try=num_points_to_try,
test_X=test_X,
multivariate=True,
bw_multiplier=bw_multiplier,
)
else: # GP
return next_sample_gp(
filtered_X=filtered_X,
filtered_y=filtered_y,
X_bounds=X_bounds,
current_X=current_X,
nu=nu,
max_samples_for_model=max_samples_for_model,
improvement=improvement,
num_points_to_try=num_points_to_try,
opt_func=opt_func,
test_X=test_X,
)
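
For context, a minimal sketch of calling next_sample with the new model argument; the sample_X / sample_y keyword names and the order of the returned tuple are assumptions taken from the surrounding calls in this file, not guaranteed by this hunk:

import numpy as np

# Hypothetical call; names not shown in this hunk (sample_X, sample_y) are assumed.
rng = np.random.default_rng(0)
observed_X = rng.uniform(size=(25, 2))   # 25 prior runs over 2 hyperparameters scaled to [0, 1]
observed_y = rng.uniform(size=25)        # corresponding metric values being minimized
suggestion, prob_imp, pred_y, pred_std, exp_imp = next_sample(
    sample_X=observed_X,
    sample_y=observed_y,
    X_bounds=[[0.0, 1.0], [0.0, 1.0]],
    model="bayes-tpe",   # "bayes-tpe-multi" selects the multivariate TPE; other values fall back to the GP
    improvement=0.1,     # fraction used by parzen_threshold to pick the "good" runs
)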


def next_sample_gp(
filtered_X: ArrayLike,
filtered_y: ArrayLike,
X_bounds: Optional[ArrayLike] = None,
current_X: Optional[ArrayLike] = None,
nu: floating = 1.5,
max_samples_for_model: integer = 100,
improvement: floating = 0.01,
num_points_to_try: integer = 1000,
opt_func: str = "expected_improvement",
test_X: Optional[ArrayLike] = None,
) -> Tuple[ArrayLike, floating, floating, floating, floating]:
# build the acquisition function
gp, y_mean, y_stddev, = train_gaussian_process(
filtered_X, filtered_y, X_bounds, current_X, nu, max_samples_for_gp
filtered_X, filtered_y, X_bounds, current_X, nu, max_samples_for_model
)
# Look for the minimum value of our fitted-target-function + (kappa * fitted-target-std_dev)
if test_X is None: # this is the usual case
@@ -324,7 +384,199 @@ def next_sample(
)


def _construct_gp_data(
def fit_parzen_estimator_scott_bw(X, X_bounds, multiplier=1.06):
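# Scott's-rule-style bandwidth: sigma_j = multiplier * sigma_hat_j * n ** (-1 / (d + 4)),
# with the interval endpoints from X_bounds appended to X as pseudo-observations.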
extended_X = np.insert(X_bounds.T, 1, X, axis=0)
mu = np.mean(extended_X, axis=0)
sumsqrs = np.sum(np.square(extended_X - mu), axis=0)
sigmahat = np.sqrt(sumsqrs / (len(extended_X) - 1))
sigmas = multiplier * sigmahat * len(extended_X) ** (-1.0 / (4.0 + len(X_bounds)))
return np.tile(sigmas, [len(X), 1])


def fit_1D_parzen_estimator_heuristic_bw(X, X_bounds):
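# Per-point bandwidth heuristic from the TPE reference implementation: each point's sigma is the
# larger of the gaps to its nearest neighbors, clipped to [prior_sigma / min(100, n + 1), prior_sigma].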
sorted_ind = np.argsort(X.copy())
sorted_mus = X[sorted_ind]

# Treat endpoints of interval as data points
# extended_mus = np.insert(X_bounds, 1, sorted_mus)

# Ignore endpoints of interval
extended_mus = np.insert([sorted_mus[0], sorted_mus[-1]], 1, sorted_mus)

sigmas = np.zeros(len(X))
sigmas[sorted_ind] = np.maximum(
extended_mus[2:] - extended_mus[1:-1], extended_mus[1:-1] - extended_mus[0:-2]
)

# Magic formula from reference implementation
prior_sigma = (X_bounds[1] - X_bounds[0]) / np.sqrt(12.0)
minsigma = prior_sigma / min(100.0, (1.0 + len(X)))
sigmas = np.clip(sigmas, minsigma, prior_sigma)

return sigmas


def sample_from_parzen_estimator(mus, sigmas, X_bounds, num_samples):
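# Draw candidates from the mixture: index -1 selects the uniform prior over X_bounds,
# indices >= 0 select one of the Gaussian components; results are clipped to the bounds.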
indices = np.random.default_rng().integers(-1, len(mus), num_samples)
samples = np.zeros((num_samples, len(X_bounds)))
uniform_ind = indices == -1
num_uniform = np.count_nonzero(uniform_ind)
samples[uniform_ind] = np.random.default_rng().uniform(
np.tile(X_bounds[:, 0], [num_uniform, 1]),
np.tile(X_bounds[:, 1], [num_uniform, 1]),
)
normal_ind = indices >= 0
samples[normal_ind] = np.random.default_rng().normal(
loc=mus[indices[normal_ind]], scale=sigmas[indices[normal_ind]]
)
return np.clip(samples, X_bounds[:, 0], X_bounds[:, 1])


def sample_from_1D_parzen_estimator(mus, sigmas, X_bounds, num_points_to_try):
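# 1-D counterpart of sample_from_parzen_estimator, used when hyperparameters are modeled independently.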
indices = np.random.default_rng().integers(-1, len(mus), num_points_to_try)
new_samples = np.zeros(num_points_to_try)

# For which_mu == -1, sample from the (uniform) prior
new_samples[indices == -1] = np.random.default_rng().uniform(
X_bounds[0], X_bounds[1], np.sum(indices == -1)
)
# Other samples are from mus
new_samples[indices >= 0] = np.random.default_rng().normal(
loc=mus[indices[indices >= 0]], scale=sigmas[indices[indices >= 0]]
)
return np.clip(new_samples, X_bounds[0], X_bounds[1])


def llik_from_parzen_estimator(samples, mus, sigmas, X_bounds):
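# Log-density of each sample under the equal-weight mixture of len(mus) Gaussians plus one
# uniform component over X_bounds (matching the sampling scheme above).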
samp_norm = (np.tile(samples, [len(mus), 1, 1]).transpose((1, 0, 2)) - mus) / sigmas
samp_norm = np.square(samp_norm)
normalization = (2.0 * np.pi) ** (-len(X_bounds) / 2.0) / np.prod(sigmas, axis=1)
pdf = normalization * np.exp(-0.5 * np.sum(samp_norm, axis=2))
uniform_pdf = 1.0 / np.prod(X_bounds[:, 1] - X_bounds[:, 0])
mixture = (np.sum(pdf, axis=1) + uniform_pdf) / (len(mus) + 1.0)
return np.log(mixture)


def llik_from_1D_parzen_estimator(samples, mus, sigmas, X_bounds):
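# 1-D counterpart of llik_from_parzen_estimator.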
samp_norm = (np.tile(samples, [len(mus), 1]).T - mus) / sigmas
llik = np.log(
(
np.sum(scipy_stats.norm.pdf(samp_norm) / sigmas, axis=1)
+ 1.0 / (X_bounds[1] - X_bounds[0])
)
/ (len(mus) + 1.0)
)
return llik


def parzen_threshold(y, gamma):
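# Boolean mask marking the ceil(gamma * sqrt(len(y))) lowest y values; these form the "good"
# set used to fit l(x), while the rest form g(x).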
num_low = int(np.ceil(gamma * np.sqrt(len(y))))
low_ind = np.argsort(y)[0:num_low]
ret_val = np.array([False] * len(y))
ret_val[low_ind] = True
return ret_val


def stats_from_parzen_estimator(low_llik, high_llik, y, gamma, low_ind):
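# Convert the low/high log-likelihoods at the chosen point into probability of improvement,
# predicted y, predicted std, and expected improvement under the two-component mixture.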
gamma_rescaled = np.sum(low_ind) / len(y)
y_star = (np.min(y[np.logical_not(low_ind)]) + np.max(y[low_ind])) / 2.0
unnorm_mean_low_y = np.sum(y[low_ind]) / len(y)
unnorm_mean_high_y = np.mean(y) - unnorm_mean_low_y
unnorm_mean_low_y_sq = np.sum(np.square(y[low_ind])) / len(y)
unnorm_mean_high_y_sq = np.mean(np.square(y)) - unnorm_mean_low_y_sq
l_of_x = np.exp(low_llik)
g_of_x = np.exp(high_llik)
p_of_x = gamma_rescaled * l_of_x + (1 - gamma_rescaled) * g_of_x
y_pred = (l_of_x * unnorm_mean_low_y + g_of_x * unnorm_mean_high_y) / p_of_x
y_sq_pred = (l_of_x * unnorm_mean_low_y_sq + g_of_x * unnorm_mean_high_y_sq) / p_of_x
y_std = np.sqrt(y_sq_pred - y_pred * y_pred)
prob_of_improvement = l_of_x * gamma_rescaled / p_of_x
expected_improvement = l_of_x * (gamma_rescaled * y_star - unnorm_mean_low_y) / p_of_x
return prob_of_improvement, y_pred, y_std, expected_improvement


def next_sample_tpe(
filtered_X: ArrayLike,
filtered_y: ArrayLike,
X_bounds: Optional[ArrayLike] = None,
current_X: Optional[ArrayLike] = None,
max_samples_for_model: integer = 100,
improvement: floating = 0.01,
num_points_to_try: integer = 1000,
test_X: Optional[ArrayLike] = None,
multivariate: Optional[bool] = False,
bw_multiplier: Optional[floating] = 0.2,
) -> Tuple[ArrayLike, floating, floating, floating, floating]:
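# TPE: split observed runs into low-y ("good") and high-y sets, fit Parzen/KDE densities l(x)
# and g(x), sample candidates from l(x), and return the candidate maximizing log l(x) - log g(x).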

if X_bounds is None:
hp_min = np.min(filtered_X, axis=0)
hp_max = np.max(filtered_X, axis=0)
X_bounds = np.column_stack((hp_min, hp_max))
else:
X_bounds = np.array(X_bounds)

low_ind = parzen_threshold(filtered_y, improvement)
low_X = filtered_X[low_ind]
high_X = filtered_X[np.logical_not(low_ind)]
num_hp = len(X_bounds)
if multivariate:
low_mus = low_X.copy()
high_mus = high_X.copy()

low_sigmas = fit_parzen_estimator_scott_bw(low_X, X_bounds, bw_multiplier)
high_sigmas = fit_parzen_estimator_scott_bw(high_X, X_bounds, bw_multiplier)

new_samples = sample_from_parzen_estimator(
low_mus, low_sigmas, X_bounds, num_points_to_try
)
low_llik = llik_from_parzen_estimator(
new_samples, low_mus, low_sigmas, X_bounds
)
high_llik = llik_from_parzen_estimator(
new_samples, high_mus, high_sigmas, X_bounds
)
score = low_llik - high_llik
best_index = np.argmax(score)
best_sample = new_samples[best_index, :]
best_low_llik = low_llik[best_index]
best_high_llik = high_llik[best_index]
else:
# Fit separate 1D Parzen estimators to each hyperparameter
best_sample = np.zeros(num_hp)
best_low_llik = 0
best_high_llik = 0
for i in range(num_hp):
low_mus = low_X[:, i]
high_mus = high_X[:, i]
low_sigmas = fit_1D_parzen_estimator_heuristic_bw(low_mus, X_bounds[i])
high_sigmas = fit_1D_parzen_estimator_heuristic_bw(high_mus, X_bounds[i])
new_samples = sample_from_1D_parzen_estimator(
low_mus, low_sigmas, X_bounds[i], num_points_to_try
)
low_llik = llik_from_1D_parzen_estimator(
new_samples, low_mus, low_sigmas, X_bounds[i]
)
high_llik = llik_from_1D_parzen_estimator(
new_samples, high_mus, high_sigmas, X_bounds[i]
)
best_index = np.argmax(low_llik - high_llik)
best_sample[i] = new_samples[best_index]
best_low_llik += low_llik[best_index]
best_high_llik += high_llik[best_index]

(
    prob_of_improvement,
    predicted_y,
    predicted_std,
    expected_improvement,
) = stats_from_parzen_estimator(
    best_low_llik, best_high_llik, filtered_y, improvement, low_ind
)

# TODO: replace nans with actual values
return (
best_sample,
prob_of_improvement,
predicted_y,
predicted_std,
expected_improvement,
)


def _construct_bayes_data(
runs: List[SweepRun], config: Union[dict, SweepConfig]
) -> Tuple[HyperParameterSet, ArrayLike, ArrayLike, ArrayLike]:
goal = config["metric"]["goal"]
@@ -432,9 +684,18 @@ def bayes_search_next_run(
if validate:
config = SweepConfig(config)

config = bayes_baseline_validate_and_fill(config)
if "metric" not in config:
raise ValueError('Bayesian search requires "metric" section')

if "method" not in config:
raise ValueError("Method must be specified")

if config["method"] not in ["bayes", "bayes-tpe", "bayes-tpe-multi"]:
raise ValueError(
'Invalid method for bayes_search_next_run, must be one of "bayes", "bayes-tpe", "bayes-tpe-multi"'
)

params, sample_X, current_X, y = _construct_gp_data(runs, config)
params, sample_X, current_X, y = _construct_bayes_data(runs, config)
X_bounds = [[0.0, 1.0]] * len(params.searchable_params)

(
@@ -448,6 +709,7 @@
sample_y=y,
X_bounds=X_bounds,
current_X=current_X if len(current_X) > 0 else None,
model=config["method"],
improvement=minimum_improvement,
)

5 changes: 4 additions & 1 deletion src/sweeps/config/schema.json
@@ -566,11 +566,14 @@
"description": "The project for this sweep"
},
"method": {
"description": "Possible values: bayes, random, grid",
"description": "Possible values: bayes, random, grid, bayes-tpe, bayes-tpe-multi, custom",
"type": "string",
"enum": [
"bayes",
"random",
"grid",
"bayes-tpe",
"bayes-tpe-multi",
"custom"
]
},
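To show how the new enum values are meant to be used, here is a hedged sketch of a sweep config selecting the TPE surrogate; only the method values come from this schema change, and the metric / parameters entries are illustrative:

sweep_config = {
    "method": "bayes-tpe",  # or "bayes-tpe-multi" for the multivariate estimator
    "metric": {"name": "validation_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"min": 0.0001, "max": 0.1},
        "dropout": {"values": [0.1, 0.3, 0.5]},
    },
}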
4 changes: 2 additions & 2 deletions src/sweeps/run.py
@@ -191,13 +191,13 @@ def next_runs(
)
elif method == "random":
return random_search_next_runs(sweep_config, validate=validate, n=n)
elif method == "bayes":
elif method in ["bayes", "bayes-tpe", "bayes-tpe-multi"]:
return bayes_search_next_runs(
runs, sweep_config, validate=validate, n=n, **kwargs
)
else:
raise ValueError(
f'Invalid search type {method}, must be one of ["grid", "random", "bayes"]'
f'Invalid search type {method}, must be one of ["grid", "random", "bayes", "bayes-tpe", "bayes-tpe-multi"]'
)

