Conv. criteria: add per-component g_abstol
This allows `options.g_abstol` to be either a scalar (the traditional
choice) or a vector. The motivation for this change is that the gradient
depends on how each variable is scaled:

     f₁(x, y) = x^2 + y^2 + 1

and

     f₂(x, y) = x^2 + 10^8 * y^2 + 1

Both have the same minimizer and minimum value, but at any point off the
x-axis the derivative of `f₂` with respect to `y` is much larger than
that of `f₁`. When roundoff error becomes a factor, the convergence
requirement should be adjusted accordingly.
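
A minimal usage sketch of the new option (an illustration, not part of the commit; it assumes the standard `optimize`/`Optim.Options` API and mirrors the `g_reltol = 0` setting used in the tests below):

```julia
using Optim

f2(v) = v[1]^2 + 1e8 * v[2]^2 + 1
function g2!(G, v)
    G[1] = 2 * v[1]
    G[2] = 2e8 * v[2]
    return G
end

# Traditional scalar tolerance: requires norm(g, Inf) ≤ 1e-8,
# i.e. |y| ≤ 5e-17 before convergence is declared.
res_scalar = optimize(f2, g2!, [1.0, 1.0], LBFGS(),
                      Optim.Options(g_abstol = 1e-8))

# Per-component tolerance: loosen the bound on the `y` gradient by the
# same 10^8 factor that scales its derivative.
res_vector = optimize(f2, g2!, [1.0, 1.0], LBFGS(),
                      Optim.Options(g_abstol = [1e-8, 1.0], g_reltol = 0))
```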
timholy committed Oct 26, 2024
1 parent e890943 commit c11c6b2
Showing 9 changed files with 85 additions and 20 deletions.
2 changes: 1 addition & 1 deletion docs/src/user/config.md
@@ -40,7 +40,7 @@ In addition to the solver, you can alter the behavior of the Optim package by us

* `x_tol`: Absolute tolerance in changes of the input vector `x`, in infinity norm. Defaults to `0.0`.
* `f_tol`: Relative tolerance in changes of the objective value. Defaults to `0.0`.
- * `g_tol`: Absolute tolerance in the gradient, in infinity norm. Defaults to `1e-8`. For gradient free methods, this will control the main convergence tolerance, which is solver specific.
+ * `g_tol`: Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Mead), this gets repurposed to control the main convergence tolerance in a solver-specific manner.
* `f_calls_limit`: A soft upper limit on the number of objective calls. Defaults to `0` (unlimited).
* `g_calls_limit`: A soft upper limit on the number of gradient calls. Defaults to `0` (unlimited).
* `h_calls_limit`: A soft upper limit on the number of Hessian calls. Defaults to `0` (unlimited).
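A plain-Julia sketch of the two acceptance rules described in the `g_tol` entry above (values invented for illustration):

```julia
g = [3e-9, 2e-5]               # gradient at the current iterate

# Scalar tolerance: one bound on the infinity norm.
maximum(abs, g) <= 1e-8        # false: |g[2]| = 2e-5 exceeds 1e-8

# Vector tolerance: each component checked against its own bound.
all(abs.(g) .<= [1e-8, 1e-4])  # true: 3e-9 ≤ 1e-8 and 2e-5 ≤ 1e-4
```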
8 changes: 5 additions & 3 deletions src/multivariate/optimize/optimize.jl
@@ -24,7 +24,10 @@ after_while!(d, state, method, options) = nothing
function initial_convergence(d, state, method::AbstractOptimizer, initial_x, options)
gradient!(d, initial_x)
stopped = !isfinite(value(d)) || any(!isfinite, gradient(d))
- maximum(abs, gradient(d)) <= options.g_abstol, stopped
+ g_abstol = options.g_abstol
+ conv = isa(g_abstol, Real) ? maximum(abs, gradient(d)) <= options.g_abstol : # scalar tolerance
+        all(t -> ((g, tol) = t; abs(g) <= tol), zip(gradient(d), g_abstol)) # per-component tolerance
+ return conv, stopped
end
function initial_convergence(d, state, method::ZerothOrderOptimizer, initial_x, options)
false, false
@@ -133,8 +136,7 @@ function optimize(d::D, initial_x::Tx, method::M,
f_abschange(d, state),
f_relchange(d, state),
g_converged,
- Tf(options.g_abstol),
- g_residual(d, state),
+ g_converge_component(options.g_abstol, d, state)...,
f_increased,
tr,
f_calls(d),
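The trailing `...` works because `g_converge_component` returns a `(tolerance, residual)` pair; splatting expands it into the two adjacent fields of the results constructor. Schematically, with invented values:

```julia
pair = (1e-8, 3.2e-9)                    # (g_abstol, g_residual) from the helper
args = (true, pair..., false)            # splat fills two consecutive slots
args == (true, 1.0e-8, 3.2e-9, false)    # true
```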
3 changes: 2 additions & 1 deletion src/multivariate/solvers/constrained/fminbox.jl
@@ -437,11 +437,12 @@ function optimize(
callback=stopped_by_callback,
f_increased=f_increased && !options.allow_f_increases)

+ g_abstol, gabs = g_converge_component(options.g_abstol, df, state)
return MultivariateOptimizationResults(F, initial_x, minimizer(results), df.f(minimizer(results)),
iteration, results.iteration_converged,
results.x_converged, results.x_abstol, results.x_reltol, norm(x - xold), norm(x - xold)/norm(x),
results.f_converged, results.f_abstol, results.f_reltol, f_abschange(minimum(results), value(dfbox)), f_relchange(minimum(results), value(dfbox)),
- results.g_converged, results.g_abstol, norm(g, Inf),
+ results.g_converged, T(g_abstol), gabs,
results.f_increased, results.trace, results.f_calls,
results.g_calls, results.h_calls, nothing,
options.time_limit,
10 changes: 6 additions & 4 deletions src/multivariate/solvers/constrained/ipnewton/interior.jl
@@ -182,7 +182,10 @@ function initial_convergence(d, state, method::ConstrainedOptimizer, initial_x,
# state.bgrad normally comes from constraints.c!(..., initial_x) in initial_state
gradient!(d, initial_x)
stopped = !isfinite(value(d)) || any(!isfinite, gradient(d))
- norm(gradient(d), Inf) + norm(state.bgrad, Inf) < options.g_abstol, stopped
+ g_abstol = options.g_abstol
+ conv = isa(g_abstol, Real) ? norm(gradient(d), Inf) + norm(state.bgrad, Inf) <= options.g_abstol : # scalar tolerance
+        all(((g, bg, tol),) -> abs(g) + abs(bg) <= tol, zip(gradient(d), state.bgrad, g_abstol)) # per-component tolerance
+ return conv, stopped
end

function optimize(f, g, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer=IPNewton(),
@@ -203,7 +206,7 @@ end
function optimize(d, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer,
options::Options = Options(;default_options(method)...))
twicediffed = d isa TwiceDifferentiable ? d : TwiceDifferentiable(d, initial_x)

bounds = ConstraintBounds(lower, upper, [], [])
constraints = TwiceDifferentiableConstraints(
(c,x)->nothing, (J,x)->nothing, (H,x,λ)->nothing, bounds)
@@ -311,8 +314,7 @@ function optimize(d::AbstractObjective, constraints::AbstractConstraints, initia
f_abschange(d, state),
f_relchange(d, state),
g_converged,
- T(options.g_abstol),
- g_residual(d),
+ g_converge_component(options.g_abstol, d, state)...,
f_increased,
tr,
f_calls(d),
14 changes: 8 additions & 6 deletions src/types.jl
@@ -15,7 +15,7 @@ x_abstol::Real = 0.0,
x_reltol::Real = 0.0,
f_abstol::Real = 0.0,
f_reltol::Real = 0.0,
- g_abstol::Real = 1e-8,
+ g_abstol::Real = 1e-8, # alternatively, supply per-component vector of tolerances
g_reltol::Real = 1e-8,
outer_x_abstol::Real = 0.0,
outer_x_reltol::Real = 0.0,
@@ -39,14 +39,14 @@ show_every::Int = 1,
callback = nothing,
time_limit = NaN
```
- See http://julianlsolvers.github.io/Optim.jl/stable/#user/config/
+ See http://julianlsolvers.github.io/Optim.jl/stable/user/config/
"""
struct Options{T, TCallback}
x_abstol::T
x_reltol::T
f_abstol::T
f_reltol::T
- g_abstol::T
+ g_abstol::Union{T,Vector{T}}
g_reltol::T
outer_x_abstol::T
outer_x_reltol::T
@@ -80,7 +80,7 @@ function Options(;
x_reltol::Real = 0.0,
f_abstol::Real = 0.0,
f_reltol::Real = 0.0,
- g_abstol::Real = 1e-8,
+ g_abstol::Union{Real,AbstractVector{<:Real}} = 1e-8,
g_reltol::Real = 1e-8,
outer_x_tol = nothing,
outer_f_tol = nothing,
@@ -129,7 +129,9 @@ function Options(;
if !(outer_f_tol === nothing)
outer_f_reltol = outer_f_tol
end
- Options(promote(x_abstol, x_reltol, f_abstol, f_reltol, g_abstol, g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol)..., f_calls_limit, g_calls_limit, h_calls_limit,
+ scalars = promote(f_abstol, f_reltol, outer_f_abstol, outer_f_reltol, x_reltol, g_reltol, outer_x_reltol, outer_g_reltol)
+ T = promote_type(eltype(scalars), eltype(x_abstol), eltype(g_abstol), eltype(outer_x_abstol), eltype(outer_g_abstol))
+ Options{T, typeof(callback)}(x_abstol, x_reltol, f_abstol, f_reltol, to_eltype(T, g_abstol), g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol, f_calls_limit, g_calls_limit, h_calls_limit,
allow_f_increases, allow_outer_f_increases, successive_f_tol, Int(iterations), Int(outer_iterations), store_trace, trace_simplex, show_trace, extended_trace, show_warnings,
Int(show_every), callback, Float64(time_limit))
end
@@ -196,7 +198,7 @@ mutable struct MultivariateOptimizationResults{O, Tx, Tc, Tf, M, Tls, Tsb} <: Op
f_abschange::Tc
f_relchange::Tc
g_converged::Bool
- g_abstol::Tf
+ g_abstol::Tf # while this might be a vector, we store the component with largest violation
g_residual::Tc
f_increased::Bool
trace::M
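A sketch of what the reworked constructor implies for the field types (hypothetical REPL session, assuming the promotion logic shown in the hunk above):

```julia
using Optim

opts = Optim.Options(g_abstol = [1f-4, 1f-8])  # Float32 vector supplied
# The remaining scalar tolerances are Float64 defaults, so the common type T
# promotes to Float64 and `to_eltype` converts the vector accordingly:
typeof(opts.g_abstol)                          # Vector{Float64}
```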
31 changes: 27 additions & 4 deletions src/utilities/assess_convergence.jl
@@ -13,7 +13,27 @@ g_residual(d, state::NelderMeadState) = state.nm_x
g_residual(d::AbstractObjective) = g_residual(gradient(d))
g_residual(d::NonDifferentiable) = convert(typeof(value(d)), NaN)
g_residual(g) = maximum(abs, g)
- gradient_convergence_assessment(state::AbstractOptimizerState, d, options) = g_residual(gradient(d)) ≤ options.g_abstol

+ function g_converge_component(g_tol::AbstractVector, g)
+     absratio(num, denom) = (iszero(num) && iszero(denom)) ? zero(num/denom) : abs(num)/denom

+     imax, maxratio = firstindex(g) - 1, typemin(first(g) / first(g_tol))
+     for i in eachindex(g_tol, g)
+         ratio = absratio(g[i], g_tol[i])
+         if ratio > maxratio
+             imax, maxratio = i, ratio
+         end
+     end
+     return g_tol[imax], abs(g[imax])
+ end
+ g_converge_component(g_tol, g) = g_tol, g_residual(g)
+ g_converge_component(g_tol::AbstractVector, d, state) = g_converge_component(g_tol, gradient(d))
+ g_converge_component(g_tol, d, state) = g_tol, g_residual(d, state)

+ function gradient_convergence_assessment(state::AbstractOptimizerState, d, options)
+     g_tol, gnorm = g_converge_component(options.g_abstol, d, state)
+     return gnorm ≤ g_tol
+ end
gradient_convergence_assessment(state::ZerothOrderState, d, options) = false

# Default function for convergence assessment used by
@@ -56,7 +76,8 @@ function assess_convergence(x, x_previous, f_x, f_x_previous, gx, x_abstol, x_re
f_increased = true
end

- g_converged = g_residual(gx) ≤ g_abstol
+ g_converged = isa(g_abstol, Real) ? g_residual(gx) ≤ g_abstol :
+               all(t -> abs(t[1]) ≤ t[2], zip(gx, g_abstol))

return x_converged, f_converged, g_converged, f_increased
end
@@ -88,8 +109,10 @@ function assess_convergence(x,
f_increased = true
end

- if g_residual(g) ≤ g_tol
-     g_converged = true
+ if isa(g_tol, Real)
+     g_converged = g_residual(g) ≤ g_tol
+ else
+     g_converged = all(t -> abs(t[1]) ≤ t[2], zip(g, g_tol))
end

return x_converged, f_converged, g_converged, f_increased
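To see which `(tolerance, residual)` pair the vector branch of `g_converge_component` reports, here is a self-contained restatement of its logic on invented numbers (ignoring the 0/0 guard):

```julia
g     = [2e-9, 5e-3]       # gradient components
g_tol = [1e-8, 1e-4]       # per-component tolerances

ratios = abs.(g) ./ g_tol  # violation ratios: [0.2, 50.0]
i = argmax(ratios)         # 2: the component farthest from converged
(g_tol[i], abs(g[i]))      # (1e-4, 0.005): what gets stored and displayed
```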
3 changes: 3 additions & 0 deletions src/utilities/generic.jl
@@ -15,3 +15,6 @@ end
(similar(initial_x), # Buffer of x for line search in state.x_ls
real(one(T))) # Keep track of step size in state.alpha
end

+ to_eltype(::Type{T}, x::Real) where T = T(x)
+ to_eltype(::Type{T}, x::AbstractArray) where T = convert(AbstractVector{T}, x)
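The new `to_eltype` helper normalizes either form of tolerance to the element type chosen by `Options`; a quick behavioral sketch (repeating the two definitions above so it runs standalone):

```julia
to_eltype(::Type{T}, x::Real) where T = T(x)
to_eltype(::Type{T}, x::AbstractArray) where T = convert(AbstractVector{T}, x)

to_eltype(Float64, 1e-8)          # 1.0e-8: scalar passes through as T
to_eltype(Float64, [1f-4, 1f-8])  # 2-element Vector{Float64}
```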
18 changes: 18 additions & 0 deletions test/general/api.jl
@@ -102,6 +102,24 @@
initial_x,
Newton(),
options_g)

+ options_gvec = Optim.Options(g_abstol = [1e-4, 1e-8], g_reltol = 0)

+ for method in (BFGS(), LBFGS(), Newton())
+     res = optimize(f, g!, h!,
+                    initial_x,
+                    method,
+                    options_gvec)
+     @test Optim.g_converged(res)
+     @test all(abs.(Optim.gradient(d2, res.minimizer)) .≤ options_gvec.g_abstol)
+     str = sprint(show, res)
+     @test occursin(r"\|g\(x\)\|.*≤ 1.0e-04", str) || occursin(r"\|g\(x\)\|.*≤ 1.0e-08", str)
+     @test occursin(r"\|x - x'\| .*≰", str)
+     @test occursin(r"\|x - x'\|/\|x'\|.*≰", str)
+     @test occursin(r"\|f\(x\) - f\(x'\)\| .*≰", str)
+     @test occursin(r"\|f\(x\) - f\(x'\)\|/\|f\(x'\)\|.*≰", str)
+ end

options_sa = Optim.Options(iterations = 10, store_trace = true,
show_trace = false)
res = optimize(f, g!, h!,
16 changes: 15 additions & 1 deletion test/general/convergence.jl
@@ -37,6 +37,9 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end
g_tol = 1e-6
g_abstol = 1e-6
@test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, false)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol]) == (true, true, true, false)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol/1000) == (true, true, false, false)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol/1000]) == (true, true, false, false)
# f_increase
f0, f1 = 1.0, 1.0 + 1e-7
@test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true)
@@ -46,8 +49,19 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end

@test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, false, true, true)

+ # Implementation with both abstol and reltol
+ f_tol = 1e-6
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, g_tol) == (true, true, true, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, [g_tol]) == (true, true, true, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, g_tol) == (true, true, true, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, [g_tol]) == (true, true, true, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, g_tol/1000) == (false, false, false, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, [g_tol/1000]) == (false, false, false, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, g_tol/1000) == (false, false, false, true)
+ @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, [g_tol/1000]) == (false, false, false, true)

f_tol = 1e-6 # rel tol
- dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol)
+ dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol) # FIXME: this isn't used?
@test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true)

f0, f1 = 1.0, 1.0 - 1e-7
