From 7791091f2d49ce70609bc21f9a6996144d296cbf Mon Sep 17 00:00:00 2001 From: Ryan Curtin Date: Tue, 31 Dec 2024 02:08:19 -0500 Subject: [PATCH] Don't reset the objective estimate on the last iteration (#417) --- HISTORY.md | 3 + doc/optimizers.md | 397 ++++++++++++++---- .../bigbatch_sgd/bigbatch_sgd_impl.hpp | 11 +- include/ensmallen_bits/eve/eve_impl.hpp | 11 +- include/ensmallen_bits/sgd/sgd_impl.hpp | 11 +- .../spalera_sgd/spalera_sgd_impl.hpp | 11 +- 6 files changed, 347 insertions(+), 97 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index fdaa5ff17..cf4128a1d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,8 @@ ### ensmallen ?.??.?: "???" ###### ????-??-?? + * Fix `exactObjective` output for SGD-like optimizers when the number of + iterations is a whole number of epochs + ([#417](https://github.com/mlpack/ensmallen/pull/417)). ### ensmallen 2.22.1: "E-Bike Excitement" ###### 2024-12-02 diff --git a/doc/optimizers.md b/doc/optimizers.md index 2691eb751..2eeda896a 100644 --- a/doc/optimizers.md +++ b/doc/optimizers.md @@ -121,11 +121,20 @@ gradient direction. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | + The attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, `MaxIterations()`, -`Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`.
+`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and +`ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -181,13 +190,20 @@ class with _`UpdateRule`_` = AdaBoundUpdate`. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods `FinalLr()`, `Gamma()`, `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -242,12 +258,19 @@ gradients. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be changed via the member methods `StepSize()`, `BatchSize()`, `Rho()`, `Epsilon()`, `MaxIterations()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -298,12 +321,19 @@ parameters. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `tolerance` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be changed via the member methods `StepSize()`, `BatchSize()`, `Epsilon()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -356,12 +386,19 @@ learning rate by sqrt(T). | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `tolerance` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be changed via the member methods `StepSize()`, `BatchSize()`, `Epsilon()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -416,11 +453,19 @@ with _`UpdateRule`_` = AdamUpdate`. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, `MaxIterations()`, -`Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and +`ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -473,13 +518,20 @@ with _`UpdateRule`_` = AdaMaxUpdate`. | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. 
| `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | The attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ExactObjective()`, and `ResetPolicy()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -625,13 +677,20 @@ class with _`UpdateRule`_` = AdaBoundUpdate`. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods `FinalLr()`, `Gamma()`, `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -684,13 +743,20 @@ with _`UpdateRule`_` = AMSGradUpdate`. | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | The attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ExactObjective()`, and `ResetPolicy()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -838,12 +904,19 @@ For convenience the following typedefs have been defined: | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the batch order is shuffled; otherwise, each batch is visited in linear order. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be changed via the member methods `BatchSize()`, `StepSize()`, `BatchDelta()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -883,14 +956,14 @@ fitness landscape, and can outperform IPOP. The larger population restarts aim to explore broadly, improving global search capabilities, while the smaller populations intensify the search in promising regions. -### Constructors +#### Constructors * `BIPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>()` * `BIPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy`_`)` * `BIPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy, batchSize, maxIterations, tolerance`_`)` * `BIPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy, batchSize, maxIterations, tolerance, selectionPolicy, stepSize, maxRestarts, populationFactor, maxFunctionEvaluations`_`)` -### Attributes +#### Attributes | **type** | **name** | **description** | **default** | |----------|----------|-----------------|-------------| @@ -1377,11 +1450,18 @@ Eve is a stochastic gradient based optimization method with locally and globally | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Beta3()`, `Epsilon()`, `Clip()`, `MaxIterations()`, -`Tolerance()`, `Shuffle()`, and `ExactObjective()`. 
+`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Beta3()`, `Epsilon()`, +`Clip()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, and `ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -1486,11 +1566,19 @@ changes. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, `MaxIterations()`, -`Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and +`ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. 
The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -1664,14 +1752,14 @@ characterized by numerous local optima. The restart mechanism is designed to improve the adaptability of CMA-ES by improving the likelihood of escaping local optima, thus increasing the chances of discovering the global optimum. -### Constructors +#### Constructors * `IPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>()` * `IPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy`_`)` * `IPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy, batchSize, maxIterations, tolerance`_`)` * `IPOP_CMAES<`_`SelectionPolicyType, TransformationPolicyType`_`>(`_`lambda, transformationPolicy, batchSize, maxIterations, tolerance, selectionPolicy, stepSize, maxRestarts, populationFactor, maxFunctionEvaluations`_`)` -### Attributes +#### Attributes | **type** | **name** | **description** | **default** | |----------|----------|-----------------|-------------| @@ -1809,13 +1897,19 @@ For convenience the following typedefs have been defined: | `size_t` | **`innerIterations`** | The number of inner iterations allowed (0 means n / batchSize). Note that the full gradient is only calculated in the outer iteration. | `0` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | - +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) 
| `false` | Attributes of the optimizer may also be changed via the member methods `Convexity()`, `Lipschitz()`, `BatchSize()`, `MaxIterations()`, `InnerIterations()`, `Tolerance()`, `Shuffle()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -1929,12 +2023,19 @@ can be paired with the `Lookahead` optimizer. | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `DecayPolicyType` | **`decayPolicy`** | Instantiated decay policy used to adjust the step size. | `DecayPolicyType()` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods `BaseOptimizer()`, `StepSize()`, `K()`, `MaxIterations()`, `Tolerance()`, `DecayPolicy()` and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -2030,14 +2131,24 @@ MomentumUpdate` and _`DecayPolicyType`_` = NoDecay`. | `MomentumUpdate` | **`updatePolicy`** | An instantiated `MomentumUpdate`. | `MomentumUpdate()` | | `DecayPolicyType` | **`decayPolicy`** | Instantiated decay policy used to adjust the step size. | `DecayPolicyType()` | | `bool` | **`resetPolicy`** | Flag that determines whether update policy parameters are reset before every Optimize call. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, `DecayPolicy()`, `ResetPolicy()`, and -`ExactObjective()`. +`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, +`UpdatePolicy()`, `DecayPolicy()`, `ResetPolicy()`, and `ExactObjective()`. -Note that the `MomentumUpdate` class has the constructor -`MomentumUpdate(`_`momentum`_`)` with a default value of `0.5` for the momentum. +***Notes:*** + + - The `MomentumUpdate` class has the constructor + `MomentumUpdate(`_`momentum`_`)` with a default value of `0.5` for the + momentum. + + - If `exactObjective` is `false`, then `Optimize(f, coordinates)` will return + an estimate of the objective function. This estimate is the sum of the + objectives obtained on the last pass of the separable functions. The + estimate will not include contributions from any separable functions not + visited in the last pass (e.g., if `maxIterations` is not an integer multiple + of `f.NumFunctions()`). #### Examples @@ -2210,14 +2321,24 @@ NesterovMomentumUpdate` and _`DecayPolicyType`_` = NoDecay`. | `NesterovMomentumUpdate` | **`updatePolicy`** | An instantiated `MomentumUpdate`.
| `NesterovMomentumUpdate()` | | `DecayPolicyType` | **`decayPolicy`** | Instantiated decay policy used to adjust the step size. | `DecayPolicyType()` | | `bool` | **`resetPolicy`** | Flag that determines whether update policy parameters are reset before every Optimize call. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, `DecayPolicy()`, `ResetPolicy()`, and -`ExactObjective()`. +`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, +`UpdatePolicy()`, `DecayPolicy()`, `ResetPolicy()`, and `ExactObjective()`. + +***Notes:*** -Note that the `NesterovMomentumUpdate` class has the constructor -`MomentumUpdate(`_`momentum`_`)` with a default value of `0.5` for the momentum. + - The `NesterovMomentumUpdate` class has the constructor + `NesterovMomentumUpdate(`_`momentum`_`)` with a default value of `0.5` for + the momentum. + + - If `exactObjective` is `false`, then `Optimize(f, coordinates)` will return + an estimate of the objective function. This estimate is the sum of the + objectives obtained on the last pass of the separable functions. The + estimate will not include contributions from any separable functions not + visited in the last pass (e.g., if `maxIterations` is not an integer multiple + of `f.NumFunctions()`). #### Examples @@ -2491,11 +2612,19 @@ Padam is a variant of Adam with a partially adaptive momentum estimation method. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm.
| `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | The attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Partial()`, `Epsilon()`, -`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and +`ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -2706,21 +2835,31 @@ SGD. #### Attributes - | **type** | **name** | **description** | **default** | - |----------|----------|-----------------|-------------| - | `double` | **`stepSize`** | Step size for each iteration. | `0.01` | - | `size_t` | **`batchSize`** | Batch size to use for each step. | `32` | - | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | - | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | - | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. 
| `true` | - | `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| **type** | **name** | **description** | **default** | +|----------|----------|-----------------|-------------| +| `double` | **`stepSize`** | Step size for each iteration. | `0.01` | +| `size_t` | **`batchSize`** | Batch size to use for each step. | `32` | +| `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | +| `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | +| `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | + +Attributes of the optimizer may also be modified via the member methods +`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, and +`ExactObjective()`. + +***Notes:*** - Attributes of the optimizer may also be modified via the member methods - `StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, and `ExactObjective()`. + - The `QHUpdate` class has the constructor `QHUpdate(`_`v, momentum`_`)` with + a default value of `0.7` for the quasi-hyperbolic term `v` and `0.999` for + the momentum term. - Note that the `QHUpdate` class has the constructor `QHUpdate(`_`v, -momentum`_`)` with a default value of `0.7` for the quasi-hyperbolic term `v` -and `0.999` for the momentum term. + - If `exactObjective` is `false`, then `Optimize(f, coordinates)` will return + an estimate of the objective function. This estimate is the sum of the + objectives obtained on the last pass of the separable functions. 
The + estimate will not include contributions from any separable functions not + visited in the last pass (e.g., if `maxIterations` is not an integer multiple + of `f.NumFunctions()`). #### Examples @@ -2771,24 +2910,32 @@ the following other optimizers: #### Attributes - | **type** | **name** | **description** | **default** | - |----------|----------|-----------------|-------------| - | `double` | **`stepSize`** | Step size for each iteration. | `0.001` | - | `size_t` | **`batchSize`** | Number of points to process in a single step. | `32` | - | `double` | **`v1`** | The First Quasi Hyperbolic Term. | `0.7` | - | `double` | **`v2`** | The Second Quasi Hyperbolic Term. | `1.00` | - | `double` | **`beta1`** | Exponential decay rate for the first moment estimates. | `0.9` | - | `double` | **`beta2`** | Exponential decay rate for the weighted infinity norm estimates. | `0.999` | - | `double` | **`eps`** | Value used to initialize the mean squared gradient parameter. | `1e-8` | - | `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | - | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | - | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | - | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | - | `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | - - The attributes of the optimizer may also be modified via the member methods - `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, - `Tolerance()`, `Shuffle()`, `V1()`, `V2()`, `ResetPolicy()`, and `ExactObjective()`. 
+| **type** | **name** | **description** | **default** | +|----------|----------|-----------------|-------------| +| `double` | **`stepSize`** | Step size for each iteration. | `0.001` | +| `size_t` | **`batchSize`** | Number of points to process in a single step. | `32` | +| `double` | **`v1`** | The First Quasi Hyperbolic Term. | `0.7` | +| `double` | **`v2`** | The Second Quasi Hyperbolic Term. | `1.00` | +| `double` | **`beta1`** | Exponential decay rate for the first moment estimates. | `0.9` | +| `double` | **`beta2`** | Exponential decay rate for the weighted infinity norm estimates. | `0.999` | +| `double` | **`eps`** | Value used to initialize the mean squared gradient parameter. | `1e-8` | +| `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` | +| `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | +| `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | +| `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | + +The attributes of the optimizer may also be modified via the member methods +`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, +`Tolerance()`, `Shuffle()`, `V1()`, `V2()`, `ResetPolicy()`, and +`ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). 
#### Examples @@ -2836,12 +2983,19 @@ RMSProp utilizes the magnitude of recent gradients to normalize the gradients. | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer can also be modified via the member methods `StepSize()`, `BatchSize()`, `Alpha()`, `Epsilon()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -3030,14 +3184,23 @@ For convenience the following typedefs have been defined: | `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` | | `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` | | `UpdatePolicyType` | **`updatePolicy`** | Instantiated update policy used to adjust the given parameters. | `UpdatePolicyType()` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be changed via the member methods `StepSize()`, `BatchSize()`, `MaxIterations()`, `InnerIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, and `ExactObjective()`. -Note that the default value for `updatePolicy` is the default constructor for -the `UpdatePolicyType`. +***Notes:*** + + - The default value for `updatePolicy` is the default constructor for the + `UpdatePolicyType`. + + - If `exactObjective` is `false`, then `Optimize(f, coordinates)` will return + an estimate of the objective function. This estimate is the sum of the + objectives obtained on the last pass of the separable functions. The + estimate will not include contributions from any separable functions not + visited in the last pass (e.g., if `maxIterations` is not an integer multiple + of `f.NumFunctions()`). #### Examples: @@ -3097,12 +3260,19 @@ VanillaUpdate` and _`DecayPolicyType`_` = NoDecay`. | `UpdatePolicyType` | **`updatePolicy`** | Instantiated update policy used to adjust the given parameters. | `UpdatePolicyType()` | | `DecayPolicyType` | **`decayPolicy`** | Instantiated decay policy used to adjust the step size. 
| `DecayPolicyType()` | | `bool` | **`resetPolicy`** | Flag that determines whether update policy parameters are reset before every Optimize call. | `true` | -| `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | +| `bool` | **`exactObjective`** | Calculate the exact objective at the end of optimization. (This could be computationally expensive!) | `false` | Attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, `DecayPolicy()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples
@@ -3167,12 +3337,19 @@ so the shorter type `SGDR<>` can be used instead of the equivalent Attributes of the optimizer can also be modified via the member methods `EpochRestart()`, `MultFactor()`, `BatchSize()`, `StepSize()`, -`MaxIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, `ResetPolicy()`, and -`ExactObjective()`. +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `UpdatePolicy()`, +`ResetPolicy()`, and `ExactObjective()`. Note that the default value for `updatePolicy` is the default constructor for the `UpdatePolicyType`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -3247,6 +3424,13 @@ snapshots. Note that the default value for `updatePolicy` is the default constructor for the `UpdatePolicyType`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -3302,6 +3486,13 @@ Attributes of the optimizer can also be modified via the member methods `StepSize()`, `BatchSize()`, `Epsilon()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -3378,6 +3569,13 @@ Note that the default values for the `updatePolicy` and `decayPolicy` parameters are simply the default constructors of the _`UpdatePolicyType`_ and _`DecayPolicyType`_ classes. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples:
@@ -3431,6 +3629,13 @@ method signatures. By default, _`DecayPolicyType`_ is set to `NoDecay`, so the shorter type `SPALeRASGD<>` can be used instead of the equivalent `SPALeRASGD`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Attributes | **type** | **name** | **description** | **default** | @@ -3450,7 +3655,15 @@ By default, _`DecayPolicyType`_ is set to `NoDecay`, so the shorter type Attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Lambda()`, -`Alpha()`, `Epsilon()`, `AdaptRate()`, `Shuffle()`, `DecayPolicy()`, `ResetPolicy()`, and `ExactObjective()`. +`Alpha()`, `Epsilon()`, `AdaptRate()`, `Shuffle()`, `DecayPolicy()`, +`ResetPolicy()`, and `ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -3507,7 +3720,15 @@ the projection of Adam steps on the gradient subspace. Attributes of the optimizer can also be modified via the member methods `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, -`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +`MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and +`ExactObjective()`. 
+ +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples: @@ -3560,8 +3781,15 @@ near-optimal convergence rates in both the batch and stochastic settings. | `bool` | **`exactObjective`** | Calculate the exact objective (Default: estimate the final objective obtained on the last pass over the data). | `false` | Attributes of the optimizer may also be modified via the member methods -`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and -`ExactObjective()`. +`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, +`ResetPolicy()`, and `ExactObjective()`. + +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). #### Examples @@ -3621,6 +3849,13 @@ The attributes of the optimizer may also be modified via the member methods `StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Eps()`, `MaxIterations()`, `Tolerance()`, `Shuffle()`, `ResetPolicy()`, and `ExactObjective()`. +***Note:*** if `exactObjective` is `false`, then `Optimize(f, coordinates)` will +return an estimate of the objective function. This estimate is the sum of the +objectives obtained on the last pass of the separable functions. 
The estimate +will not include contributions from any separable functions not visited in the +last pass (e.g., if `maxIterations` is not an integer multiple of +`f.NumFunctions()`). + #### Examples ```c++ diff --git a/include/ensmallen_bits/bigbatch_sgd/bigbatch_sgd_impl.hpp b/include/ensmallen_bits/bigbatch_sgd/bigbatch_sgd_impl.hpp index cd88660e2..ed8561e4c 100644 --- a/include/ensmallen_bits/bigbatch_sgd/bigbatch_sgd_impl.hpp +++ b/include/ensmallen_bits/bigbatch_sgd/bigbatch_sgd_impl.hpp @@ -244,10 +244,13 @@ BigBatchSGD::Optimize( terminate |= Callback::BeginEpoch(*this, f, iterate, epoch, overallObjective, callbacks...); - // Reset the counter variables. - lastObjective = overallObjective; - overallObjective = 0; - currentFunction = 0; + // Reset the counter variables if we will continue. + if (i != actualMaxIterations) + { + lastObjective = overallObjective; + overallObjective = 0; + currentFunction = 0; + } if (shuffle) // Determine order of visitation. f.Shuffle(); diff --git a/include/ensmallen_bits/eve/eve_impl.hpp b/include/ensmallen_bits/eve/eve_impl.hpp index 3237a4eab..3fe58475d 100644 --- a/include/ensmallen_bits/eve/eve_impl.hpp +++ b/include/ensmallen_bits/eve/eve_impl.hpp @@ -186,10 +186,13 @@ Eve::Optimize(SeparableFunctionType& function, terminate |= Callback::BeginEpoch(*this, f, iterate, epoch, overallObjective, callbacks...); - // Reset the counter variables. - lastOverallObjective = overallObjective; - overallObjective = 0; - currentFunction = 0; + // Reset the counter variables if we will continue. + if (i != actualMaxIterations) + { + lastOverallObjective = overallObjective; + overallObjective = 0; + currentFunction = 0; + } if (shuffle) // Determine order of visitation. 
f.Shuffle(); diff --git a/include/ensmallen_bits/sgd/sgd_impl.hpp b/include/ensmallen_bits/sgd/sgd_impl.hpp index d34115b63..0050a0d17 100644 --- a/include/ensmallen_bits/sgd/sgd_impl.hpp +++ b/include/ensmallen_bits/sgd/sgd_impl.hpp @@ -193,10 +193,13 @@ SGD::Optimize( terminate |= Callback::BeginEpoch(*this, f, iterate, epoch, overallObjective, callbacks...); - // Reset the counter variables. - lastObjective = overallObjective; - overallObjective = 0; - currentFunction = 0; + // Reset the counter variables if we will continue. + if (i != actualMaxIterations) + { + lastObjective = overallObjective; + overallObjective = 0; + currentFunction = 0; + } if (shuffle) // Determine order of visitation. f.Shuffle(); diff --git a/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp b/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp index e0cac7de1..2e56e2cab 100644 --- a/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp +++ b/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp @@ -212,10 +212,13 @@ SPALeRASGD::Optimize( return overallObjective; } - // Reset the counter variables. - lastObjective = overallObjective; - overallObjective = 0; - currentFunction = 0; + // Reset the counter variables if we will continue. + if (i != actualMaxIterations) + { + lastObjective = overallObjective; + overallObjective = 0; + currentFunction = 0; + } terminate |= Callback::BeginEpoch(*this, f, iterate, epoch, overallObjective, callbacks...);