diff --git a/src/sweeps/bayes_search.py b/src/sweeps/bayes_search.py
index 9aa7949a..bbbdfdc0 100644
--- a/src/sweeps/bayes_search.py
+++ b/src/sweeps/bayes_search.py
@@ -57,6 +57,8 @@ def sigmoid(x: ArrayLike) -> ArrayLike:
 
 
 def random_sample(X_bounds: ArrayLike, num_test_samples: integer) -> ArrayLike:
+    if hasattr(X_bounds, "tolist"):
+        X_bounds = X_bounds.tolist()
     num_hyperparameters = len(X_bounds)
     test_X = np.empty((int(num_test_samples), num_hyperparameters))
     for ii in range(num_test_samples):
@@ -171,10 +173,12 @@ def next_sample(
     X_bounds: Optional[ArrayLike] = None,
     current_X: Optional[ArrayLike] = None,
     nu: floating = 1.5,
-    max_samples_for_gp: integer = 100,
-    improvement: floating = 0.01,
+    max_samples_for_model: integer = 100,
+    improvement: floating = 0.1,
+    bw_multiplier: floating = 0.2,
     num_points_to_try: integer = 1000,
     opt_func: str = "expected_improvement",
+    model: str = "gp",
     test_X: Optional[ArrayLike] = None,
 ) -> Tuple[ArrayLike, floating, floating, floating, floating]:
     """Calculates the best next sample to look at via bayesian optimization.
@@ -194,12 +198,14 @@ def next_sample(
             http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html
 
-        max_samples_for_gp: integer, optional, default 100
+        max_samples_for_model: integer, optional, default 100
             maximum samples to consider (since algo is O(n^3)) for performance,
             but also adds some randomness. This number of samples will be chosen
             randomly from the sample_X and used to train the model.
         improvement: floating, optional, default 0.1
             amount of improvement to optimize for -- higher means take more exploratory risks
+        bw_multiplier: floating, optional, default 0.2
+            scaling factor for the kernel density estimation bandwidth used by the bayes-tpe-multi method
         num_points_to_try: integer, optional, default 1000
             number of X values to try when looking for value with highest expected probability
             of improvement
@@ -207,6 +213,8 @@ def next_sample(
             improvement of probability of improvement. Expected improvement is generally
             better - may want to remove probability of improvement at some point.
             (But I think probability of improvement is a little easier to calculate)
+        model: one of {"gp", "bayes-tpe", "bayes-tpe-multi"}, default "gp" - whether to use a
+            Gaussian Process as the surrogate model, a Tree-structured Parzen Estimator, or a multivariate TPE
         test_X: X values to test when looking for the best values to try
 
     Returns:
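To make the new `model` argument concrete before the dispatch hunk below, here is a minimal, hypothetical usage sketch. The toy data and the import alias are assumptions (the tests in this patch use `bayes.next_sample`); the five-element return tuple follows the updated signature:

```python
# Hypothetical usage of the new `model` argument -- not part of the patch.
# Assumes next_sample is importable as in the tests (sweeps.bayes_search).
import numpy as np
from sweeps import bayes_search as bayes

X = np.random.uniform(0.0, 1.0, size=(20, 2))  # 20 observed configs, 2 hyperparameters
y = np.sum(np.square(X - 0.5), axis=1)         # toy loss to minimize

sample, prob, pred, pred_std, exp_imp = bayes.next_sample(
    sample_X=X,
    sample_y=y,
    X_bounds=[[0.0, 1.0], [0.0, 1.0]],
    model="bayes-tpe",  # "gp" (default) keeps the old behavior; "bayes-tpe-multi" uses the multivariate KDE
    improvement=0.1,
)
print(sample, prob, pred)
```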
@@ -263,9 +271,61 @@ def next_sample(
             np.nan,
         )
 
+    if model == "bayes-tpe":
+        return next_sample_tpe(
+            filtered_X=filtered_X,
+            filtered_y=filtered_y,
+            X_bounds=X_bounds,
+            current_X=current_X,
+            max_samples_for_model=max_samples_for_model,
+            improvement=improvement,
+            num_points_to_try=num_points_to_try,
+            test_X=test_X,
+            multivariate=False,
+        )
+    elif model == "bayes-tpe-multi":
+        return next_sample_tpe(
+            filtered_X=filtered_X,
+            filtered_y=filtered_y,
+            X_bounds=X_bounds,
+            current_X=current_X,
+            max_samples_for_model=max_samples_for_model,
+            improvement=improvement,
+            num_points_to_try=num_points_to_try,
+            test_X=test_X,
+            multivariate=True,
+            bw_multiplier=bw_multiplier,
+        )
+    else:  # GP
+        return next_sample_gp(
+            filtered_X=filtered_X,
+            filtered_y=filtered_y,
+            X_bounds=X_bounds,
+            current_X=current_X,
+            nu=nu,
+            max_samples_for_model=max_samples_for_model,
+            improvement=improvement,
+            num_points_to_try=num_points_to_try,
+            opt_func=opt_func,
+            test_X=test_X,
+        )
+
+
+def next_sample_gp(
+    filtered_X: ArrayLike,
+    filtered_y: ArrayLike,
+    X_bounds: Optional[ArrayLike] = None,
+    current_X: Optional[ArrayLike] = None,
+    nu: floating = 1.5,
+    max_samples_for_model: integer = 100,
+    improvement: floating = 0.01,
+    num_points_to_try: integer = 1000,
+    opt_func: str = "expected_improvement",
+    test_X: Optional[ArrayLike] = None,
+) -> Tuple[ArrayLike, floating, floating, floating, floating]:
     # build the acquisition function
     gp, y_mean, y_stddev, = train_gaussian_process(
-        filtered_X, filtered_y, X_bounds, current_X, nu, max_samples_for_gp
+        filtered_X, filtered_y, X_bounds, current_X, nu, max_samples_for_model
    )
     # Look for the minimum value of our fitted-target-function + (kappa * fitted-target-std_dev)
     if test_X is None:  # this is the usual case
@@ -324,7 +384,199 @@ def next_sample(
     )
 
 
-def _construct_gp_data(
+def fit_parzen_estimator_scott_bw(X, X_bounds, multiplier=1.06):
+    extended_X = np.insert(X_bounds.T, 1, X, axis=0)
+    mu = np.mean(extended_X, axis=0)
+    sumsqrs = np.sum(np.square(extended_X - mu), axis=0)
+    sigmahat = np.sqrt(sumsqrs / (len(extended_X) - 1))
+    sigmas = multiplier * sigmahat * len(extended_X) ** (-1.0 / (4.0 + len(X_bounds)))
+    return np.tile(sigmas, [len(X), 1])
+
+
+def fit_1D_parzen_estimator_heuristic_bw(X, X_bounds):
+    sorted_ind = np.argsort(X.copy())
+    sorted_mus = X[sorted_ind]
+
+    # Treat endpoints of interval as data points
+    # extended_mus = np.insert(X_bounds, 1, sorted_mus)
+
+    # Ignore endpoints of interval
+    extended_mus = np.insert([sorted_mus[0], sorted_mus[-1]], 1, sorted_mus)
+
+    sigmas = np.zeros(len(X))
+    sigmas[sorted_ind] = np.maximum(
+        extended_mus[2:] - extended_mus[1:-1], extended_mus[1:-1] - extended_mus[0:-2]
+    )
+
+    # Magic formula from reference implementation
+    prior_sigma = (X_bounds[1] - X_bounds[0]) / np.sqrt(12.0)
+    minsigma = prior_sigma / min(100.0, (1.0 + len(X)))
+    sigmas = np.clip(sigmas, minsigma, prior_sigma)
+
+    return sigmas
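As a quick illustration of the Scott's-rule fit above, this self-contained sketch computes the same per-dimension bandwidths on toy data (names here are illustrative, not part of the patch; the patch additionally tiles the result once per sample). Appending the bound endpoints as extra observations keeps the bandwidth from collapsing when the data are tightly clustered:

```python
import numpy as np

def scott_bandwidth(X, X_bounds, multiplier=1.06):
    # Same recipe as fit_parzen_estimator_scott_bw: treat the two bound
    # endpoints as extra data points, then apply Scott's rule
    # sigma_hat * n**(-1 / (4 + d)) in each dimension.
    extended = np.vstack([X_bounds.T, X])  # row order does not affect mean/std
    sigma_hat = np.std(extended, axis=0, ddof=1)
    return multiplier * sigma_hat * len(extended) ** (-1.0 / (4.0 + X.shape[1]))

X = np.array([[0.2, 0.4], [0.5, 0.1], [0.9, 0.7]])
bounds = np.array([[0.0, 1.0], [0.0, 1.0]])
print(scott_bandwidth(X, bounds))  # one bandwidth per hyperparameter
```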
+def sample_from_parzen_estimator(mus, sigmas, X_bounds, num_samples):
+    indices = np.random.default_rng().integers(-1, len(mus), num_samples)
+    samples = np.zeros((num_samples, len(X_bounds)))
+    uniform_ind = indices == -1
+    num_uniform = np.count_nonzero(uniform_ind)
+    samples[uniform_ind] = np.random.default_rng().uniform(
+        np.tile(X_bounds[:, 0], [num_uniform, 1]),
+        np.tile(X_bounds[:, 1], [num_uniform, 1]),
+    )
+    normal_ind = indices >= 0
+    samples[normal_ind] = np.random.default_rng().normal(
+        loc=mus[indices[normal_ind]], scale=sigmas[indices[normal_ind]]
+    )
+    return np.clip(samples, X_bounds[:, 0], X_bounds[:, 1])
+
+
+def sample_from_1D_parzen_estimator(mus, sigmas, X_bounds, num_points_to_try):
+    indices = np.random.default_rng().integers(-1, len(mus), num_points_to_try)
+    new_samples = np.zeros(num_points_to_try)
+
+    # For which_mu == -1, sample from the (uniform) prior
+    new_samples[indices == -1] = np.random.default_rng().uniform(
+        X_bounds[0], X_bounds[1], np.sum(indices == -1)
+    )
+    # Other samples are from mus
+    new_samples[indices >= 0] = np.random.default_rng().normal(
+        loc=mus[indices[indices >= 0]], scale=sigmas[indices[indices >= 0]]
+    )
+    return np.clip(new_samples, X_bounds[0], X_bounds[1])
+
+
+def llik_from_parzen_estimator(samples, mus, sigmas, X_bounds):
+    samp_norm = (np.tile(samples, [len(mus), 1, 1]).transpose((1, 0, 2)) - mus) / sigmas
+    samp_norm = np.square(samp_norm)
+    normalization = (2.0 * np.pi) ** (-len(X_bounds) / 2.0) / np.prod(sigmas, axis=1)
+    pdf = normalization * np.exp(-0.5 * np.sum(samp_norm, axis=2))
+    uniform_pdf = 1.0 / np.prod(X_bounds[:, 1] - X_bounds[:, 0])
+    mixture = (np.sum(pdf, axis=1) + uniform_pdf) / (len(mus) + 1.0)
+    return np.log(mixture)
+
+
+def llik_from_1D_parzen_estimator(samples, mus, sigmas, X_bounds):
+    samp_norm = (np.tile(samples, [len(mus), 1]).T - mus) / sigmas
+    llik = np.log(
+        (
+            np.sum(scipy_stats.norm.pdf(samp_norm) / sigmas, axis=1)
+            + 1.0 / (X_bounds[1] - X_bounds[0])
+        )
+        / (len(mus) + 1.0)
+    )
+    return llik
+
+
+def parzen_threshold(y, gamma):
+    num_low = int(np.ceil(gamma * np.sqrt(len(y))))
+    low_ind = np.argsort(y)[0:num_low]
+    ret_val = np.array([False] * len(y))
+    ret_val[low_ind] = True
+    return ret_val
+
+
+def stats_from_parzen_estimator(low_llik, high_llik, y, gamma, low_ind):
+    gamma_rescaled = np.sum(low_ind) / len(y)
+    y_star = (np.min(y[np.logical_not(low_ind)]) + np.max(y[low_ind])) / 2.0
+    unnorm_mean_low_y = np.sum(y[low_ind]) / len(y)
+    unnorm_mean_high_y = np.mean(y) - unnorm_mean_low_y
+    unnorm_mean_low_y_sq = np.sum(np.square(y[low_ind])) / len(y)
+    unnorm_mean_high_y_sq = np.mean(np.square(y)) - unnorm_mean_low_y_sq
+    l_of_x = np.exp(low_llik)
+    g_of_x = np.exp(high_llik)
+    p_of_x = gamma_rescaled * l_of_x + (1 - gamma_rescaled) * g_of_x
+    y_pred = (l_of_x * unnorm_mean_low_y + g_of_x * unnorm_mean_high_y) / p_of_x
+    y_sq_pred = (l_of_x * unnorm_mean_low_y_sq + g_of_x * unnorm_mean_high_y_sq) / p_of_x
+    y_std = np.sqrt(y_sq_pred - y_pred * y_pred)
+    prob_of_improvement = l_of_x * gamma_rescaled / p_of_x
+    expected_improvement = l_of_x * (gamma_rescaled * y_star - unnorm_mean_low_y) / p_of_x
+    return prob_of_improvement, y_pred, y_std, expected_improvement
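Before the TPE driver below, a small worked example of the `parzen_threshold` split (toy values, not from the patch): with `gamma = 0.5` and nine observations, `ceil(0.5 * sqrt(9)) = 2` points land in the low ("good") set that feeds l(x); the rest feed g(x):

```python
import numpy as np

# Reproduces parzen_threshold's selection rule on toy data: mark the
# ceil(gamma * sqrt(n)) lowest-y observations as the low (good) set.
y = np.array([5.0, 1.0, 3.0, 2.0, 4.0, 6.0, 0.5, 7.0, 8.0])
gamma = 0.5
num_low = int(np.ceil(gamma * np.sqrt(len(y))))  # ceil(0.5 * 3) == 2
mask = np.zeros(len(y), dtype=bool)
mask[np.argsort(y)[:num_low]] = True
print(mask)  # True exactly at y == 1.0 and y == 0.5
```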
+def next_sample_tpe(
+    filtered_X: ArrayLike,
+    filtered_y: ArrayLike,
+    X_bounds: Optional[ArrayLike] = None,
+    current_X: Optional[ArrayLike] = None,
+    max_samples_for_model: integer = 100,
+    improvement: floating = 0.01,
+    num_points_to_try: integer = 1000,
+    test_X: Optional[ArrayLike] = None,
+    multivariate: Optional[bool] = False,
+    bw_multiplier: Optional[floating] = 0.2,
+) -> Tuple[ArrayLike, floating, floating, floating, floating]:
+
+    if X_bounds is None:
+        hp_min = np.min(filtered_X, axis=0)
+        hp_max = np.max(filtered_X, axis=0)
+        X_bounds = np.column_stack((hp_min, hp_max))
+    else:
+        X_bounds = np.array(X_bounds)
+
+    low_ind = parzen_threshold(filtered_y, improvement)
+    low_X = filtered_X[low_ind]
+    high_X = filtered_X[np.logical_not(low_ind)]
+    num_hp = len(X_bounds)
+    if multivariate:
+        low_mus = low_X.copy()
+        high_mus = high_X.copy()
+
+        low_sigmas = fit_parzen_estimator_scott_bw(low_X, X_bounds, bw_multiplier)
+        high_sigmas = fit_parzen_estimator_scott_bw(high_X, X_bounds, bw_multiplier)
+
+        new_samples = sample_from_parzen_estimator(
+            low_mus, low_sigmas, X_bounds, num_points_to_try
+        )
+        low_llik = llik_from_parzen_estimator(
+            new_samples, low_mus, low_sigmas, X_bounds
+        )
+        high_llik = llik_from_parzen_estimator(
+            new_samples, high_mus, high_sigmas, X_bounds
+        )
+        score = low_llik - high_llik
+        best_index = np.argmax(score)
+        best_sample = new_samples[best_index, :]
+        best_low_llik = low_llik[best_index]
+        best_high_llik = high_llik[best_index]
+    else:
+        # Fit separate 1D Parzen estimators to each hyperparameter
+        best_sample = np.zeros(num_hp)
+        best_low_llik = 0
+        best_high_llik = 0
+        for i in range(num_hp):
+            low_mus = low_X[:, i]
+            high_mus = high_X[:, i]
+            low_sigmas = fit_1D_parzen_estimator_heuristic_bw(low_mus, X_bounds[i])
+            high_sigmas = fit_1D_parzen_estimator_heuristic_bw(high_mus, X_bounds[i])
+            new_samples = sample_from_1D_parzen_estimator(
+                low_mus, low_sigmas, X_bounds[i], num_points_to_try
+            )
+            low_llik = llik_from_1D_parzen_estimator(
+                new_samples, low_mus, low_sigmas, X_bounds[i]
+            )
+            high_llik = llik_from_1D_parzen_estimator(
+                new_samples, high_mus, high_sigmas, X_bounds[i]
+            )
+            best_index = np.argmax(low_llik - high_llik)
+            best_sample[i] = new_samples[best_index]
+            best_low_llik += low_llik[best_index]
+            best_high_llik += high_llik[best_index]
+
+    (
+        prob_of_improvement,
+        predicted_y,
+        predicted_std,
+        expected_improvement,
+    ) = stats_from_parzen_estimator(
+        best_low_llik, best_high_llik, filtered_y, improvement, low_ind
+    )
+
+    # TODO: replace nans with actual values
+    return (
+        best_sample,
+        prob_of_improvement,
+        predicted_y,
+        predicted_std,
+        expected_improvement,
+    )
+
+
+def _construct_bayes_data(
     runs: List[SweepRun], config: Union[dict, SweepConfig]
 ) -> Tuple[HyperParameterSet, ArrayLike, ArrayLike, ArrayLike]:
     goal = config["metric"]["goal"]
@@ -432,9 +684,18 @@ def bayes_search_next_run(
     if validate:
         config = SweepConfig(config)
 
-    config = bayes_baseline_validate_and_fill(config)
+    if "metric" not in config:
+        raise ValueError('Bayesian search requires "metric" section')
+
+    if "method" not in config:
+        raise ValueError("Method must be specified")
+
+    if config["method"] not in ["bayes", "bayes-tpe", "bayes-tpe-multi"]:
+        raise ValueError(
+            'Invalid method for bayes_search_next_run, must be one of "bayes", "bayes-tpe", "bayes-tpe-multi"'
+        )
 
-    params, sample_X, current_X, y = _construct_gp_data(runs, config)
+    params, sample_X, current_X, y = _construct_bayes_data(runs, config)
     X_bounds = [[0.0, 1.0]] * len(params.searchable_params)
 
     (
@@ -448,6 +709,7 @@ def bayes_search_next_run(
         sample_y=y,
         X_bounds=X_bounds,
         current_X=current_X if len(current_X) > 0 else None,
+        model=config["method"],
         improvement=minimum_improvement,
     )
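With the schema change below, a sweep config can opt into the TPE variants directly. A minimal, hypothetical config for reference (the metric and parameter names are invented, and the import path is assumed to match the tests in this patch):

```python
# Hypothetical sweep config exercising the new method value; field names
# follow the schema change below. Import path assumed from the tests.
from sweeps.config import SweepConfig

config = SweepConfig(
    {
        "method": "bayes-tpe",  # or "bayes-tpe-multi" for the multivariate KDE
        "metric": {"name": "loss", "goal": "minimize"},
        "parameters": {"lr": {"min": 0.0001, "max": 0.1, "distribution": "uniform"}},
    }
)
```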
"type": "string", "enum": [ "bayes", "random", "grid", + "bayes-tpe", + "bayes-tpe-multi", "custom" ] }, diff --git a/src/sweeps/run.py b/src/sweeps/run.py index 3c074284..67d949fc 100644 --- a/src/sweeps/run.py +++ b/src/sweeps/run.py @@ -191,13 +191,13 @@ def next_runs( ) elif method == "random": return random_search_next_runs(sweep_config, validate=validate, n=n) - elif method == "bayes": + elif method in ["bayes", "bayes-tpe", "bayes-tpe-multi"]: return bayes_search_next_runs( runs, sweep_config, validate=validate, n=n, **kwargs ) else: raise ValueError( - f'Invalid search type {method}, must be one of ["grid", "random", "bayes"]' + f'Invalid search type {method}, must be one of ["grid", "random", "bayes", "bayes-tpe", or "bayes-tpe-multi"]' ) diff --git a/tests/test_bayes_search.py b/tests/test_bayes_search.py index a1e11252..25d163c4 100644 --- a/tests/test_bayes_search.py +++ b/tests/test_bayes_search.py @@ -1,5 +1,6 @@ import os import json +import time from typing import Callable, Optional, Tuple, Iterable, Dict, Union import pytest @@ -23,6 +24,36 @@ def rosenbrock(x: ArrayLike) -> np.floating: return np.sum((x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0) +rastrigin_A = 10.0 + + +def rastrigin(x: ArrayLike) -> np.floating: + # has a global minimum at (1, 1, 1, ...) with x_i \in [-4.12, 6.12] + return rastrigin_A * len(x) + np.sum( + np.square(x - 1.0) - rastrigin_A * np.cos(2 * np.pi * (x - 1.0)) + ) + + +shekel_beta = np.array([0.1, 0.2, 0.2, 0.4, 0.4, 0.6, 0.3, 0.7, 0.5, 0.5]) +shekel_C = np.array( + [ + [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0], + [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6], + [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0], + [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6], + ] +).T +shekel_min = -10.5364 + + +def shekel(x: ArrayLike) -> np.floating: + # Four minima in bounding region [[0,10]]*4 + return ( + -np.sum(np.reciprocal(np.sum(np.square(shekel_C - x), axis=1) + shekel_beta)) + - shekel_min + ) + + def run_bayes_search( f: Callable[[SweepRun], floating], config: SweepConfig, @@ -101,12 +132,15 @@ def run_iterations( optimium: Optional[ArrayLike] = None, atol: Optional[ArrayLike] = 0.2, chunk_size: integer = 1, + model: str = "gp", + bw_multiplier: floating = 1.0, ) -> Tuple[ArrayLike, ArrayLike]: + bounds = np.array(bounds) if x_init is not None: X = x_init else: - X = [np.zeros(len(bounds))] + X = [np.random.uniform(low=bounds[:, 0], high=bounds[:, 1])] y = np.array([f(x) for x in X]).flatten() @@ -116,12 +150,15 @@ def run_iterations( for cc in range(chunk_size): if counter >= num_iterations: break - (sample, prob, pred, _, _,) = bayes.next_sample( + (sample, prob, pred, pred_std, exp_imp) = bayes.next_sample( sample_X=X, sample_y=y, X_bounds=bounds, current_X=sample_X, improvement=improvement, + model=model, + bw_multiplier=bw_multiplier, + max_samples_for_model=100, ) if sample_X is None: sample_X = np.array([sample]) @@ -129,8 +166,8 @@ def run_iterations( sample_X = np.append(sample_X, np.array([sample]), axis=0) counter += 1 print( - "X: {} prob(I): {} pred: {} value: {}".format( - sample, prob, pred, f(sample) + "X: {} prob(I): {} EI: {} pred: {} value: {} pred_std: {}".format( + sample, prob, exp_imp, pred, f(sample), pred_std ) ) @@ -177,14 +214,34 @@ def test_squiggle_convergence(): run_iterations(squiggle, [[0.0, 5.0]], 200, x_init, optimium=[3.6], atol=0.2) -def test_squiggle_convergence_to_maximum(): +@pytest.mark.parametrize("method", ["bayes", "bayes-tpe", "bayes-tpe-multi"]) +def 
diff --git a/tests/test_bayes_search.py b/tests/test_bayes_search.py
index a1e11252..25d163c4 100644
--- a/tests/test_bayes_search.py
+++ b/tests/test_bayes_search.py
@@ -1,5 +1,6 @@
 import os
 import json
+import time
 from typing import Callable, Optional, Tuple, Iterable, Dict, Union
 
 import pytest
@@ -23,6 +24,36 @@ def rosenbrock(x: ArrayLike) -> np.floating:
     return np.sum((x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)
 
 
+rastrigin_A = 10.0
+
+
+def rastrigin(x: ArrayLike) -> np.floating:
+    # has a global minimum at (1, 1, 1, ...) with x_i \in [-4.12, 6.12]
+    return rastrigin_A * len(x) + np.sum(
+        np.square(x - 1.0) - rastrigin_A * np.cos(2 * np.pi * (x - 1.0))
+    )
+
+
+shekel_beta = np.array([0.1, 0.2, 0.2, 0.4, 0.4, 0.6, 0.3, 0.7, 0.5, 0.5])
+shekel_C = np.array(
+    [
+        [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0],
+        [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6],
+        [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0],
+        [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6],
+    ]
+).T
+shekel_min = -10.5364
+
+
+def shekel(x: ArrayLike) -> np.floating:
+    # Four minima in bounding region [[0, 10]] * 4
+    return (
+        -np.sum(np.reciprocal(np.sum(np.square(shekel_C - x), axis=1) + shekel_beta))
+        - shekel_min
+    )
+
+
 def run_bayes_search(
     f: Callable[[SweepRun], floating],
     config: SweepConfig,
@@ -101,12 +132,15 @@ def run_iterations(
     optimium: Optional[ArrayLike] = None,
     atol: Optional[ArrayLike] = 0.2,
     chunk_size: integer = 1,
+    model: str = "gp",
+    bw_multiplier: floating = 1.0,
 ) -> Tuple[ArrayLike, ArrayLike]:
 
+    bounds = np.array(bounds)
     if x_init is not None:
         X = x_init
     else:
-        X = [np.zeros(len(bounds))]
+        X = [np.random.uniform(low=bounds[:, 0], high=bounds[:, 1])]
 
     y = np.array([f(x) for x in X]).flatten()
 
@@ -116,12 +150,15 @@ def run_iterations(
         for cc in range(chunk_size):
             if counter >= num_iterations:
                 break
-            (sample, prob, pred, _, _,) = bayes.next_sample(
+            (sample, prob, pred, pred_std, exp_imp) = bayes.next_sample(
                 sample_X=X,
                 sample_y=y,
                 X_bounds=bounds,
                 current_X=sample_X,
                 improvement=improvement,
+                model=model,
+                bw_multiplier=bw_multiplier,
+                max_samples_for_model=100,
             )
             if sample_X is None:
                 sample_X = np.array([sample])
@@ -129,8 +166,8 @@ def run_iterations(
                 sample_X = np.append(sample_X, np.array([sample]), axis=0)
             counter += 1
             print(
-                "X: {} prob(I): {} pred: {} value: {}".format(
-                    sample, prob, pred, f(sample)
+                "X: {} prob(I): {} EI: {} pred: {} value: {} pred_std: {}".format(
+                    sample, prob, exp_imp, pred, f(sample), pred_std
                 )
             )
 
@@ -177,14 +214,34 @@ def test_squiggle_convergence():
     run_iterations(squiggle, [[0.0, 5.0]], 200, x_init, optimium=[3.6], atol=0.2)
 
 
-def test_squiggle_convergence_to_maximum():
+@pytest.mark.parametrize("method", ["bayes", "bayes-tpe", "bayes-tpe-multi"])
+def test_squiggle_convergence_to_maximum(method, capsys):
     # This test checks whether the bayes algorithm correctly explores the parameter space
     # we sample a ton of positive examples, ignoring the negative side
     def f(x):
         return -squiggle(x)
 
     x_init = np.random.uniform(0, 5, 1)[:, None]
-    run_iterations(f, [[0.0, 5.0]], 200, x_init, optimium=[2], atol=0.2)
+    if method == "bayes-tpe" or method == "bayes-tpe-multi":
+        improvement = 0.15
+    else:
+        improvement = 0.1
+
+    start = time.time()
+    run_iterations(
+        f,
+        [[0.0, 5.0]],
+        2000,
+        x_init,
+        improvement=improvement,
+        optimium=[2],
+        atol=0.2,
+        model=method,
+    )
+    stop = time.time()
+
+    with capsys.disabled():
+        print(f"took {stop-start:0.2f} seconds")
 
 
 def test_nans():
@@ -210,10 +267,17 @@ def test_squiggle_int():
         assert np.isclose(sample % 1, 0)
 
 
-def test_iterations_rosenbrock():
+@pytest.mark.parametrize("method", ["bayes", "bayes-tpe", "bayes-tpe-multi"])
+def test_iterations_rosenbrock(method, capsys):
     dimensions = 3
     # x_init = np.random.uniform(0, 2, size=(1, dimensions))
     x_init = np.zeros((1, dimensions))
+    if method == "bayes-tpe" or method == "bayes-tpe-multi":
+        improvement = 0.15
+    else:
+        improvement = 0.1
+
+    start = time.time()
     run_iterations(
         rosenbrock,
         [[0.0, 2.0]] * dimensions,
@@ -221,8 +285,13 @@ def test_iterations_rosenbrock(
         x_init,
         optimium=[1, 1, 1],
         atol=0.2,
-        improvement=0.1,
+        improvement=improvement,
+        model=method,
     )
+    stop = time.time()
+
+    with capsys.disabled():
+        print(f"took {stop-start:0.2f} seconds")
 
 
 def test_iterations_squiggle_chunked():
@@ -870,7 +939,7 @@ def test_metric_extremum_in_bayes_search():
     data_path = f"{os.path.dirname(__file__)}/data/ygnwe8ptupj33get.decoded.json"
     with open(data_path, "r") as f:
         data = json.load(f)
-    _, _, _, y = bayes._construct_gp_data(
+    _, _, _, y = bayes._construct_bayes_data(
         [SweepRun(**r) for r in data["jsonPayload"]["data"]["runs"]],
         data["jsonPayload"]["data"]["config"],
    )
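As a sanity check on the shifted Shekel benchmark added above, its global minimum sits near (4, 4, 4, 4) with value approximately `shekel_min`, so the shifted function should evaluate to roughly zero there. A self-contained verification (constants copied from the test module; not part of the patch):

```python
import numpy as np

# Evaluate the (unshifted) Shekel function at the known minimizer (4, 4, 4, 4);
# the result should be close to shekel_min = -10.5364.
beta = np.array([0.1, 0.2, 0.2, 0.4, 0.4, 0.6, 0.3, 0.7, 0.5, 0.5])
C = np.array(
    [
        [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0],
        [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6],
        [4.0, 1.0, 8.0, 6.0, 3.0, 2.0, 5.0, 8.0, 6.0, 7.0],
        [4.0, 1.0, 8.0, 6.0, 7.0, 9.0, 3.0, 1.0, 2.0, 3.6],
    ]
).T
x = np.array([4.0, 4.0, 4.0, 4.0])
value = -np.sum(np.reciprocal(np.sum(np.square(C - x), axis=1) + beta))
print(value)  # approximately -10.54, i.e. close to shekel_min
```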
diff --git a/tests/test_validation.py b/tests/test_validation.py
index e676d264..43ea5423 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -8,7 +8,9 @@
 from sweeps.hyperband_stopping import hyperband_stop_runs
 
 
-@pytest.mark.parametrize("search_type", ["bayes", "grid", "random"])
+@pytest.mark.parametrize(
+    "search_type", ["bayes", "grid", "random", "bayes-tpe", "bayes-tpe-multi"]
+)
 def test_validation_disable(search_type):
     invalid_schema = {
         "metric": {"name": "loss", "goal": "minimise"},
@@ -26,7 +28,7 @@ def test_validation_disable(search_type):
         _ = hyperband_stop_runs([], invalid_schema, validate=True)
 
     with pytest.raises(ValidationError):
-        if search_type == "bayes":
+        if "bayes" in search_type:
             _ = bayes_search_next_runs([], invalid_schema, validate=True)
         elif search_type == "grid":
             _ = grid_search_next_runs([], invalid_schema, validate=True)
@@ -38,6 +40,46 @@ def test_validation_disable(search_type):
     assert result is not None
 
 
+def test_bayes_methods():
+    schema = {
+        "method": "bayes-tpex",
+        "metric": {"name": "loss", "goal": "minimize"},
+        "parameters": {"a": {"min": 0, "max": 1, "distribution": "uniform"}},
+    }
+    with pytest.raises(ValidationError):
+        SweepConfig(schema)
+
+    schema = {
+        "method": "bayes-gp",
+        "metric": {"name": "loss", "goal": "minimize"},
+        "parameters": {"a": {"min": 0, "max": 1, "distribution": "uniform"}},
+    }
+    with pytest.raises(ValidationError):
+        SweepConfig(schema)
+
+    schema = {
+        "method": "baye",
+        "metric": {"name": "loss", "goal": "minimize"},
+        "parameters": {"a": {"min": 0, "max": 1, "distribution": "uniform"}},
+    }
+    with pytest.raises(ValidationError):
+        SweepConfig(schema)
+
+    schema = {
+        "method": "bayes-tpe",
+        "metric": {"name": "loss", "goal": "minimize"},
+        "parameters": {"a": {"min": 0, "max": 1, "distribution": "uniform"}},
+    }
+    SweepConfig(schema)
+
+    schema = {
+        "method": "bayes",
+        "metric": {"name": "loss", "goal": "minimize"},
+        "parameters": {"a": {"min": 0, "max": 1, "distribution": "uniform"}},
+    }
+    SweepConfig(schema)
+
+
 def test_validation_not_enough_params():
     schema = {"method": "random", "parameters": {}}