From d98b8063f5b2c5321742fffea0b4c39659fceda7 Mon Sep 17 00:00:00 2001
From: Paul Koch
Date: Fri, 18 Oct 2024 12:06:23 -0700
Subject: [PATCH] improved learning rate parameter

---
 docs/interpret/hyperparameters.md   | 50 +++++++++----------
 .../interpret/glassbox/_ebm/_ebm.py | 24 ++++-----
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/docs/interpret/hyperparameters.md b/docs/interpret/hyperparameters.md
index 4f9b00d11..a8e3e3680 100644
--- a/docs/interpret/hyperparameters.md
+++ b/docs/interpret/hyperparameters.md
@@ -13,11 +13,9 @@ hyperparameters: [0, 50, 100, 200, 500, 1000]
 guidance: This is an important hyperparameter to tune. The optimal smoothing_rounds value will vary depending on the dataset's characteristics. Adjust based on the prevalence of smooth feature response curves.
 
 ## learning_rate
-default: 0.016 (classification), 0.06 (regression)
+default: 0.015 (classification), 0.04 (regression)
 
-NOTE: 0.018 is better for multiclass. 0.016 optimizes for binary classification
-
-hyperparameters: [0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002]
+hyperparameters: [0.0025, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2]
 
 guidance: This is an important hyperparameter to tune. The conventional wisdom is that a lower learning rate is generally better, but we have found the relationship to be more complex for EBMs. In general, regression seems to prefer a higher learning rate, binary classification seems to prefer a lower learning rate, and multiclass is in-between.
 
@@ -26,41 +24,43 @@ default: 0.9
 
 ideal: As many as possible
 
-hyperparameters: [0, 0.5, 0.75, 0.9, 0.95, 5, 10, 25, 50, 100, 250]
+hyperparameters: [0.0, 0.9, 0.95, 0.99, 100, 250, 1000]
 
-guidance: Introducing more interactions tends to improve model accuracy. Values between 0 and LESS than 1.0 are interpreted as percentages of the number of features. For example, a dataset with 100 features and an interactions value of 0.75 will automatically detect and use 75 interactions. Values of 1 or higher indicate the exact number of interactions to be detected, so for example 1 would create 1 interaction term, and 50 would create 50.
+guidance: Introducing more interactions tends to improve model accuracy. Values greater than 0 and less than 1.0 are interpreted as a percentage of the number of features. For example, a dataset with 100 features and an interactions value of 0.7 will automatically detect and use 70 interactions. Values of 1 or higher indicate the exact number of interactions to be detected; for example, 1 would create a single interaction term and 50 would create 50.
 
 ## inner_bags
 default: 0
 
-WARNING: Setting this value to 50 will typically increase the fitting time by a factor of 50x.
+WARNING: Setting this value to 20 will typically increase the fitting time by a factor of 20.
 
-ideal: 50 (diminishing returns beyond this point)
+ideal: 20 (diminishing returns beyond this point)
 
-hyperparameters: [0] OR if you can afford it [0, 50]
+hyperparameters: [0] OR if you can afford it [0, 20]
 
-guidance: The default inner_bags value of 0 disables inner bagging. Setting this parameter to 1 or other low values will typically make the model worse since model fitting will then only use a subset of the data but not do enough inner bagging to compensate. Increasing the number of inner bags to 50 can improve model accuracy at the cost of significantly longer fitting times. If computation time is not a constraint, we suggest trying both 0 and 50, but not other values in between.
+guidance: The default inner_bags value of 0 disables inner bagging. Setting this parameter to 1 or another low value will typically make the model worse, since fitting then uses only a subset of the data without enough inner bags to compensate. Increasing the number of inner bags to 20 can improve model accuracy at the cost of significantly longer fitting times. If computation time is not a constraint, we suggest trying both 0 and 20, but not other values in between.
 
 ## max_bins
 default: 1024
 
-hyperparameters: [256, 512, 1024, 4096, 16384, 65536]
+ideal: 1024 (diminishing returns beyond this point)
+
+hyperparameters: [1024]
 
-guidance: Higher max_bins values can improve model accuracy by allowing more granular discretization of features, although increasing max_bins beyond 1024 sometimes decreases model performance. While the default minimizes memory consumption and speeds up training, we suggest testing larger values if resources permit.
+guidance: Increasing the max_bins value can enhance model accuracy by enabling finer discretization of features. Values above 1024 seem to result in very small changes to model performance, although there might be benefits for very large datasets. Setting max_bins to 1024 often provides a good balance between model performance, memory requirements, and fitting time.
 
 ## max_interaction_bins
-default: 32
+default: 64
 
-hyperparameters: [8, 16, 32, 64, 128, 256]
+hyperparameters: [64]
 
-guidance: For max_interaction_bins, more is not necessarily better. A good value on many datasets seems to be 32, but it's worth trying higher and lower values.
+guidance: For max_interaction_bins, more is typically better in terms of model performance; however, fitting times increase significantly above 64 bins for very little benefit, which is why we recommend 64 as the default. If your fitting times are acceptable, setting max_interaction_bins to 256 or more might improve the model slightly.
 
 ## greedy_ratio
 default: 12.0
 
-hyperparameters: [0.0, 1.0, 2.0, 5.0, 12.0, 20.0]
+hyperparameters: [0.0, 1.0, 2.0, 5.0, 10.0, 12.0, 20.0]
 
-guidance: greedy_ratio is a good candidate for hyperparameter tuning as the best value is dataset dependent.
+guidance: Values of greedy_ratio above 5.0 seem to result in similar model performance.
 
 ## cyclic_progress
 default: 0.0
@@ -76,14 +76,14 @@ ideal: 14 (diminishing returns beyond this point)
 
 hyperparameters: [14]
 
-guidance: Increasing outer bags beyond 14 provides no benefit. Reducing outer_bags below 14 might improve fitting time on machines with less than 14 cores.
+guidance: Increasing outer bags beyond 14 provides no observable benefit. Reducing outer_bags below 14 might improve fitting time on machines with fewer than 14 cores. Setting outer_bags to 8 is reasonable on many datasets and can improve fitting time.
 
 ## interaction_smoothing_rounds
-default: 50
+default: 100
 
-hyperparameters: [0, 50, 100, 500]
+hyperparameters: [0, 50, 100, 200, 500, 1000]
 
-guidance: interaction_smoothing_rounds appears to have only a minor impact on model accuracy. 0 is often the best choice. 0 is often the most accurate choice, but the interaction shape plots will be smoother and easier to interpret with more interaction_smoothing_rounds.
+guidance: interaction_smoothing_rounds appears to have only a minor impact on model accuracy. 100 is a good default, but it might be worth trying other values when optimizing a model.
 ## max_leaves
 default: 2
@@ -102,9 +102,9 @@ guidance: The default value usually works well, however experimenting with sligh
 ## min_hessian
 default: 0.0
 
-hyperparameters: [1e-4, 0.0]
+hyperparameters: [0.0, 1e-4]
 
-guidance: Generally 0.0 is the best choice for min_hessian, but on some datasets it might be useful to set min_hessian.
+guidance: Generally 0.0 is the best choice for min_hessian, or close to it, but on some datasets it can be useful to set min_hessian to a small non-zero value.
 
 ## max_rounds
 default: 25000
@@ -113,7 +113,7 @@ ideal: 1000000000 (early stopping should stop long before this point)
 
 hyperparameters: [1000000000]
 
-guidance: The max_rounds parameter serves as a limit to prevent excessive training on datasets where improvements taper off. Set this parameter sufficiently high to avoid premature early stopping provided fitting times are reasonable. Consider increasing it if small yet consistent gains are observed in longer trainings.
+guidance: The max_rounds parameter serves as a limit to prevent excessive training on datasets where improvements taper off. Set it high enough that training is ended by early stopping rather than by this limit, provided fitting times remain reasonable.
 
 ## early_stopping_rounds
 default: 100
@@ -129,7 +129,7 @@ default: 0.0
 
 hyperparameters: [0.0]
 
-guidance: early_stopping_tolerance is set to 0.0 by default, however setting it to a negative value sometimes yields slightly higher accuracy. EBMs are a bagged ensemble model, so overfitting each individual bag a little can be beneficial because after the models are averaged together in the ensemble averaging decreases the variance due to overfitting. Using a negative value for early_stopping_tolerance allows the individual models to be overfit.
+guidance: Setting early_stopping_tolerance to a small positive value around 1e-4 can help reduce fitting times on some datasets with minimal degradation in model performance. Setting it to a negative value sometimes yields slightly better models. EBMs are a bagged ensemble, so overfitting each individual bag a little can be beneficial: averaging the models together in the ensemble reduces the variance introduced by overfitting. A negative early_stopping_tolerance allows the individual models to overfit in this way.
 
 ## validation_size
 default: 0.15
diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
index 5cfc2e335..7a6d1d4bc 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -2426,7 +2426,7 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
         - `'nominal'`: Categorical where the order has no meaning. Eg: country names
     max_bins : int, default=1024
         Max number of bins per feature for the main effects stage.
-    max_interaction_bins : int, default=32
+    max_interaction_bins : int, default=64
         Max number of bins per feature for interaction terms.
 
     interactions : int, float, or list of tuples of feature indices, default=0.9
@@ -2448,7 +2448,7 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
         Number of outer bags. Outer bags are used to generate error bounds and help with smoothing the graphs.
     inner_bags : int, default=0
         Number of inner bags. 0 turns off inner bagging.
-    learning_rate : float, default=0.016
+    learning_rate : float, default=0.015
         Learning rate for boosting.
     greedy_ratio : float, default=12.0
         The proportion of greedy boosting steps relative to cyclic boosting steps.
@@ -2464,7 +2464,7 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
         to a value less than 1.0 can be useful for preventing overfitting.
     smoothing_rounds : int, default=100
         Number of initial highly regularized rounds to set the basic shape of the main effect feature graphs.
-    interaction_smoothing_rounds : int, default=50
+    interaction_smoothing_rounds : int, default=100
         Number of initial highly regularized rounds to set the basic shape of the interaction effect feature graphs during fitting.
     max_rounds : int, default=25000
         Total number of boosting rounds with n_terms boosting steps per round.
@@ -2631,7 +2631,7 @@ def __init__(
         ] = None,
         # Preprocessor
         max_bins: int = 1024,
-        max_interaction_bins: int = 32,
+        max_interaction_bins: int = 64,
         # Stages
         interactions: Optional[
             Union[int, float, Sequence[Union[int, str, Sequence[Union[int, str]]]]]
@@ -2642,11 +2642,11 @@ def __init__(
         outer_bags: int = 14,
         inner_bags: Optional[int] = 0,
         # Boosting
-        learning_rate: float = 0.016,
+        learning_rate: float = 0.015,
         greedy_ratio: Optional[float] = 12.0,
         cyclic_progress: Union[bool, float, int] = False,  # noqa: PYI041
         smoothing_rounds: Optional[int] = 100,
-        interaction_smoothing_rounds: Optional[int] = 50,
+        interaction_smoothing_rounds: Optional[int] = 100,
         max_rounds: Optional[int] = 25000,
         early_stopping_rounds: Optional[int] = 100,
         early_stopping_tolerance: Optional[float] = 0.0,
@@ -2772,7 +2772,7 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
         - `'nominal'`: Categorical where the order has no meaning. Eg: country names
     max_bins : int, default=1024
         Max number of bins per feature for the main effects stage.
-    max_interaction_bins : int, default=32
+    max_interaction_bins : int, default=64
         Max number of bins per feature for interaction terms.
 
     interactions : int, float, or list of tuples of feature indices, default=0.9
@@ -2794,7 +2794,7 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
         Number of outer bags. Outer bags are used to generate error bounds and help with smoothing the graphs.
     inner_bags : int, default=0
         Number of inner bags. 0 turns off inner bagging.
-    learning_rate : float, default=0.06
+    learning_rate : float, default=0.04
         Learning rate for boosting.
     greedy_ratio : float, default=12.0
         The proportion of greedy boosting steps relative to cyclic boosting steps.
@@ -2810,7 +2810,7 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
         to a value less than 1.0 can be useful for preventing overfitting.
     smoothing_rounds : int, default=100
         Number of initial highly regularized rounds to set the basic shape of the main effect feature graphs.
-    interaction_smoothing_rounds : int, default=50
+    interaction_smoothing_rounds : int, default=100
         Number of initial highly regularized rounds to set the basic shape of the interaction effect feature graphs during fitting.
     max_rounds : int, default=25000
         Total number of boosting rounds with n_terms boosting steps per round.
@@ -2977,7 +2977,7 @@ def __init__(
         ] = None,
         # Preprocessor
         max_bins: int = 1024,
-        max_interaction_bins: int = 32,
+        max_interaction_bins: int = 64,
         # Stages
         interactions: Optional[
             Union[int, float, Sequence[Union[int, str, Sequence[Union[int, str]]]]]
@@ -2988,11 +2988,11 @@ def __init__(
         outer_bags: int = 14,
         inner_bags: Optional[int] = 0,
         # Boosting
-        learning_rate: float = 0.06,
+        learning_rate: float = 0.04,
         greedy_ratio: Optional[float] = 12.0,
         cyclic_progress: Union[bool, float, int] = False,  # noqa: PYI041
         smoothing_rounds: Optional[int] = 100,
-        interaction_smoothing_rounds: Optional[int] = 50,
+        interaction_smoothing_rounds: Optional[int] = 100,
        max_rounds: Optional[int] = 25000,
         early_stopping_rounds: Optional[int] = 100,
         early_stopping_tolerance: Optional[float] = 0.0,
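
For reviewers who want to exercise the updated grids, here is a minimal sketch (not part of the patch) that random-searches a few of the ranges suggested in docs/interpret/hyperparameters.md. The synthetic dataset, n_iter, and scoring choices are assumptions made for the example; EBMs are scikit-learn-compatible estimators, so the standard search utilities apply.

```python
# Minimal sketch: tuning an EBM over the grids suggested in hyperparameters.md.
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from interpret.glassbox import ExplainableBoostingClassifier

# Illustrative synthetic binary-classification data.
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)

# Grids mirror the "hyperparameters:" lists in the updated docs above.
param_distributions = {
    "learning_rate": [0.0025, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2],
    "smoothing_rounds": [0, 50, 100, 200, 500, 1000],
    "interactions": [0.0, 0.9, 0.95, 0.99],
    "greedy_ratio": [0.0, 1.0, 2.0, 5.0, 10.0, 12.0, 20.0],
}

search = RandomizedSearchCV(
    ExplainableBoostingClassifier(),  # picks up the new defaults from this patch
    param_distributions=param_distributions,
    n_iter=20,
    scoring="roc_auc",
    cv=3,
    random_state=0,
    n_jobs=1,  # EBM fitting is already parallel across outer bags
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```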
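The interactions guidance above distinguishes fractional values from integer counts; a small sketch of the two modes (constructor calls only, training data assumed as in the previous sketch):

```python
from interpret.glassbox import ExplainableBoostingClassifier

# 0 < x < 1: a fraction of n_features; with 100 features, 0.9 auto-detects 90 interactions.
ebm_frac = ExplainableBoostingClassifier(interactions=0.9)

# x >= 1: an exact count; detect and fit exactly 50 pairwise interaction terms.
ebm_count = ExplainableBoostingClassifier(interactions=50)
```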
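Similarly, the max_rounds and early_stopping_tolerance guidance combine as sketched below; the specific -1e-4 is an illustrative negative value chosen for the example, not a recommendation from the docs.

```python
# Sketch: rely on early stopping rather than the round cap, per the docs above.
from interpret.glassbox import ExplainableBoostingRegressor

ebm = ExplainableBoostingRegressor(
    max_rounds=1_000_000_000,        # "ideal" cap; early stopping halts far sooner
    early_stopping_rounds=100,       # default shown in the docs
    early_stopping_tolerance=-1e-4,  # negative: let each bag overfit a little; the
                                     # bagged average then cancels some of that variance
)
# ebm.fit(X, y)  # X, y as in the first sketch
```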