diff --git a/Project.toml b/Project.toml index 54c68a3c..421fab33 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NLLSsolver" uuid = "50b5cf9e-bf7a-4e29-8981-4280cbba70cb" authors = ["Oliver Woodford"] -version = "3.1.2" +version = "3.2.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -11,6 +11,7 @@ HybridArrays = "1baab800-613f-4b0a-84e4-9cd3431bfbb9" IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" LDLFactorizations = "40e66cde-538c-5869-a4ad-c39174c6795b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -23,6 +24,7 @@ ForwardDiff = "0.10" HybridArrays = "0.4" IfElse = "0.1.1" LDLFactorizations = "0.10" +LoopVectorization = "0.12.165" OrderedCollections = "1.6.2" Static = "0.8.7" StaticArrays = "1" diff --git a/examples/rosenbrock.jl b/examples/rosenbrock.jl index e9ce5c34..d21d16be 100644 --- a/examples/rosenbrock.jl +++ b/examples/rosenbrock.jl @@ -46,25 +46,27 @@ end OptimResult() = OptimResult(Observable(Vector{Float64}()), Observable(Vector{Point2f}())) function runoptimizers!(results, problem, start, iterators) + costtrajectory = NLLSsolver.CostTrajectory() for (ind, iter) in enumerate(iterators) # Set the start problem.variables[1] = NLLSsolver.EuclideanVector(start[1], start[2]) # Optimize the cost - options = NLLSsolver.NLLSOptions(reldcost=1.e-6, iterator=iter, storetrajectory=true, storecosts=true) - result = NLLSsolver.optimize!(problem, options) + options = NLLSsolver.NLLSOptions(reldcost=1.e-6, iterator=iter) + result = NLLSsolver.optimize!(problem, options, nothing, NLLSsolver.storecostscallback(costtrajectory)) # Construct the trajectory - resize!(results[ind].trajectory.val, length(result.trajectory)+1) + resize!(results[ind].trajectory.val, length(costtrajectory.trajectory)+1) @inbounds results[ind].trajectory.val[1] = Point2f(start[1], start[2]) - for (i, step) in enumerate(result.trajectory) + for (i, step) in enumerate(costtrajectory.trajectory) @inbounds results[ind].trajectory.val[i+1] = results[ind].trajectory.val[i] + Point2f(step[1], step[2]) end # Set the costs - resize!(results[ind].costs.val, length(result.costs)+1) + resize!(results[ind].costs.val, length(costtrajectory.costs)+1) @inbounds results[ind].costs.val[1] = result.startcost - @inbounds results[ind].costs.val[2:end] .= max.(result.costs, 1.e-38) + @inbounds results[ind].costs.val[2:end] .= max.(costtrajectory.costs, 1.e-38) + empty!(costtrajectory) end for res in results notify(res.costs) diff --git a/src/BlockDenseMatrix.jl b/src/BlockDenseMatrix.jl index d671e7f6..e9963a9d 100644 --- a/src/BlockDenseMatrix.jl +++ b/src/BlockDenseMatrix.jl @@ -11,11 +11,11 @@ end uniformscaling!(bdm::BlockDenseMatrix, k) = uniformscaling!(bdm.data, k) zero!(bdm::BlockDenseMatrix) = fill!(bdm.data, 0) -block(bdm::BlockDenseMatrix, i, j, ::StaticInt{rows}, ::StaticInt{cols}) where {rows, cols} = SizedMatrix{rows, cols}(view(bdm.data, SR(0, rows-1).+bdm.rowblockoffsets[i], SR(0, cols-1).+bdm.columnblockoffsets[j])) -block(bdm::BlockDenseMatrix, i, j, rows::Integer, cols::Integer) = view(bdm.data, SR(0, rows-1).+bdm.rowblockoffsets[i], SR(0, cols-1).+bdm.columnblockoffsets[j]) -block(bdm::BlockDenseMatrix, i, j, ::StaticInt{rows}, cols::Integer) where rows = HybridArray{Tuple{rows, StaticArrays.Dynamic()}}(block(bdm, i, j, rows, cols)) 
-block(bdm::BlockDenseMatrix, i, j) = block(bdm, i, j, bdm.rowblockoffsets[i+1]-bdm.rowblockoffsets[i], bdm.columnblockoffsets[j+1]-bdm.columnblockoffsets[j]) -validblock(::BlockDenseMatrix, ::Integer, ::Integer) = true +@inline block(bdm::BlockDenseMatrix, i, j, ::StaticInt{rows}, ::StaticInt{cols}) where {rows, cols} = @inbounds SizedMatrix{rows, cols}(view(bdm.data, SR(0, rows-1).+bdm.rowblockoffsets[i], SR(0, cols-1).+bdm.columnblockoffsets[j])) +@inline block(bdm::BlockDenseMatrix, i, j, rows::Integer, cols::Integer) = @inbounds view(bdm.data, SR(0, rows-1).+bdm.rowblockoffsets[i], SR(0, cols-1).+bdm.columnblockoffsets[j]) +@inline block(bdm::BlockDenseMatrix, i, j, ::StaticInt{rows}, cols::Integer) where rows = HybridArray{Tuple{rows, StaticArrays.Dynamic()}}(block(bdm, i, j, rows, cols)) +@inline block(bdm::BlockDenseMatrix, i, j) = @inbounds block(bdm, i, j, bdm.rowblockoffsets[i+1]-bdm.rowblockoffsets[i], bdm.columnblockoffsets[j+1]-bdm.columnblockoffsets[j]) +@inline validblock(::BlockDenseMatrix, ::Integer, ::Integer) = true Base.size(bdm::BlockDenseMatrix) = size(bdm.data) Base.size(bdm::BlockDenseMatrix, dim) = size(bdm.data, dim) Base.length(bdm::BlockDenseMatrix) = length(bdm.data) diff --git a/src/BlockSparseMatrix.jl b/src/BlockSparseMatrix.jl index f9ffb2ff..70c1ce70 100644 --- a/src/BlockSparseMatrix.jl +++ b/src/BlockSparseMatrix.jl @@ -99,11 +99,11 @@ function uniformscaling!(M::BlockSparseMatrix, k) end zero!(bsm::BlockSparseMatrix) = fill!(bsm.data, 0) -block(bsm::BlockSparseMatrix, i, j, ::StaticInt{rows}, ::StaticInt{cols}) where {rows, cols} = SizedMatrix{rows, cols}(view(bsm.data, SR(0, rows*cols-1).+bsm.indicestransposed[j,i])) -block(bsm::BlockSparseMatrix, i, j, rows::Integer, cols::Integer) = reshape(view(bsm.data, SR(0, rows*cols-1).+bsm.indicestransposed[j,i]), (rows, cols)) +block(bsm::BlockSparseMatrix, i, j, ::StaticInt{rows}, ::StaticInt{cols}) where {rows, cols} = @inbounds SizedMatrix{rows, cols}(view(bsm.data, SR(0, rows*cols-1).+bsm.indicestransposed[j,i])) +block(bsm::BlockSparseMatrix, i, j, rows::Integer, cols::Integer) = @inbounds reshape(view(bsm.data, SR(0, rows*cols-1).+bsm.indicestransposed[j,i]), (rows, cols)) block(bsm::BlockSparseMatrix, i, j, ::StaticInt{rows}, cols::Integer) where rows = HybridArray{Tuple{rows, StaticArrays.Dynamic()}}(block(bsm, i, j, rows, cols)) -block(bsm::BlockSparseMatrix, i, j) = block(bsm, i, j, bsm.rowblocksizes[i], bsm.columnblocksizes[j]) -validblock(bsm::BlockSparseMatrix, i, j) = bsm.indicestransposed[j,i] != 0 +block(bsm::BlockSparseMatrix, i, j) = @inbounds block(bsm, i, j, bsm.rowblocksizes[i], bsm.columnblocksizes[j]) +validblock(bsm::BlockSparseMatrix, i, j) = @inbounds bsm.indicestransposed[j,i] != 0 Base.size(bsm::BlockSparseMatrix) = (bsm.m, bsm.n) Base.size(bsm::BlockSparseMatrix, dim) = dim == 1 ? bsm.m : (dim == 2 ? 
bsm.n : 1) Base.length(bsm::BlockSparseMatrix) = bsm.m * bsm.n diff --git a/src/NLLSsolver.jl b/src/NLLSsolver.jl index 1bc2aef2..80ef63e9 100644 --- a/src/NLLSsolver.jl +++ b/src/NLLSsolver.jl @@ -8,15 +8,17 @@ export ContaminatedGaussian # Adaptive robustifiers export EuclideanVector, ZeroToInfScalar, ZeroToOneScalar # Concrete general variables export Rotation3DR, Rotation3DL, Point3D, Pose3D, EffPose3D, UnitPose3D # Concrete 3D geometry variable types export SimpleCamera, NoDistortionCamera, ExtendedUnifiedCamera, BarrelDistortion, EULensDistortion # Concrete camera sensor & lens variable types +export CostTrajectory # Object for storing opimization trajectory and costs # Functions export addcost!, addvariable!, updatevarcostmap!, subproblem, subproblem!, nvars, nres # Construct a problem export update, nvars # Variable interface export nres, ndeps, varindices, getvars # Residual interface export robustkernel, robustify, robustifydcost, robustifydkernel # Robustifier interface export cost, computeresidual, computeresjac, computecost, computecostgradhess # Compute the objective -export optimize!, printoutcallback # Optimize the objective +export optimize! # Optimize the objective export rodrigues, project, epipolarerror, proj2orthonormal # Multi-view geometry helper functions export ideal2image, image2ideal, pixel2image, image2pixel, ideal2distorted, distorted2ideal, convertlens +export nullcallback, printoutcallback, storecostscallback # Callbacks to be called once per optimization iteration # Exported abstract types abstract type AbstractCost end # Standard (non-squared) cost @@ -57,6 +59,7 @@ include("robustadaptive.jl") include("autodiff.jl") include("cost.jl") include("linearsolver.jl") +include("callbacks.jl") include("iterators.jl") include("optimize.jl") end diff --git a/src/autodiff.jl b/src/autodiff.jl index 180d3bc9..1d606fa5 100644 --- a/src/autodiff.jl +++ b/src/autodiff.jl @@ -162,4 +162,24 @@ computegradhesshelper(varflags, residual, vars, x) = computecost(residual, updat @inline autorobustifydcost(kernel::AbstractRobustifier, cost) = computehessian(Base.Fix1(robustify, kernel), cost) @inline autorobustifydkernel(kernel::AbstractAdaptiveRobustifier, cost::T) where T = computehessian(fixallbutlast(computerobustgradhesshelper, kernel, cost), zeros(SVector{nvars(kernel)+1, T})) -computerobustgradhesshelper(kernel, cost, x) = robustify(update(kernel, x), cost+x[end]) \ No newline at end of file +computerobustgradhesshelper(kernel, cost, x) = robustify(update(kernel, x), cost+x[end]) + +# Overloads of some specific functions +struct RotMatLie{T} + x::T + y::T + z::T +end + +function rodrigues(x::T, y::T, z::T) where T<:ForwardDiff.Dual + @assert x == 0 && y == 0 && z == 0 + return RotMatLie{T}(x, y, z) +end + +du(x) = ForwardDiff.partials(x) +Base.:*(r::AbstractMatrix, u::RotMatLie{T}) where T = SMatrix{3, 3, T, 9}(T(r[1,1], du(u.z)*r[1,2]-du(u.y)*r[1,3]), T(r[2,1], du(u.z)*r[2,2]-du(u.y)*r[2,3]), T(r[3,1], du(u.z)*r[3,2]-du(u.y)*r[3,3]), + T(r[1,2], du(u.x)*r[1,3]-du(u.z)*r[1,1]), T(r[2,2], du(u.x)*r[2,3]-du(u.z)*r[2,1]), T(r[3,2], du(u.x)*r[3,3]-du(u.z)*r[3,1]), + T(r[1,3], du(u.y)*r[1,1]-du(u.x)*r[1,2]), T(r[2,3], du(u.y)*r[2,1]-du(u.x)*r[2,2]), T(r[3,3], du(u.y)*r[3,1]-du(u.x)*r[3,2])) +Base.:*(u::RotMatLie{T}, r::AbstractMatrix) where T = SMatrix{3, 3, T, 9}(T(r[1,1], du(u.y)*r[3,1]-du(u.z)*r[2,1]), T(r[2,1], du(u.z)*r[1,1]-du(u.x)*r[3,1]), T(r[3,1], du(u.x)*r[2,1]-du(u.y)*r[1,1]), + T(r[1,2], du(u.y)*r[3,2]-du(u.z)*r[2,2]), T(r[2,2], du(u.z)*r[1,2]-du(u.x)*r[3,2]), 
T(r[3,2], du(u.x)*r[2,2]-du(u.y)*r[1,2]), + T(r[1,3], du(u.y)*r[3,3]-du(u.z)*r[2,3]), T(r[2,3], du(u.z)*r[1,3]-du(u.x)*r[3,3]), T(r[3,3], du(u.x)*r[2,3]-du(u.y)*r[1,3])) diff --git a/src/callbacks.jl b/src/callbacks.jl new file mode 100644 index 00000000..2f041e7f --- /dev/null +++ b/src/callbacks.jl @@ -0,0 +1,54 @@ +import Printf.@printf + +# Default: do nothing +nullcallback(cost, unusedargs...) = (cost, 0) + +# Print out per-iteration results +function printoutcallback(cost, problem, data, trailingargs...) + if data.iternum == 1 + # First iteration, so print out column headers and the zeroth iteration (i.e. start) values + println("iter cost cost_change |step|") + @printf("% 4d % 8e % 4.3e % 3.2e\n", 0, data.bestcost, 0, 0) + end + @printf("% 4d % 8e % 4.3e % 3.2e\n", data.iternum, cost, data.bestcost-cost, norm(data.linsystem.x)) + return cost, 0 +end +function printoutcallback(cost, data, trradius::Float64) + if data.iternum == 1 + # First iteration, so print out column headers and the zeroth iteration (i.e. start) values + println("iter cost cost_change |step| tr_radius") + @printf("% 4d % 8e % 4.3e % 3.2e % 2.1e\n", 0, data.bestcost, 0, 0, trradius) + end + @printf("% 4d % 8e % 4.3e % 3.2e % 2.1e\n", data.iternum, cost, data.bestcost-cost, norm(data.linsystem.x), trradius) + return cost, 0 +end + +# Store per-iteration costs +function storecostscallback(costs::Vector{Float64}, cost, unusedargs...) + push!(costs, cost) + return (cost, 0) +end + +struct CostTrajectory + costs::Vector{Float64} + trajectory::Vector{Vector{Float64}} + + function CostTrajectory() + return new(sizehint!(Vector{Float64}(), 50), sizehint!(Vector{Vector{Float64}}(), 50)) + end +end +function Base.empty!(ct::CostTrajectory) + empty!(ct.costs) + empty!(ct.trajectory) + return ct +end + +# Store per-iteration costs and trajectory +function storecostscallback(store::CostTrajectory, cost, problem, data, unusedargs...) + push!(store.costs, cost) + push!(store.trajectory, Vector{Float64}(data.linsystem.x)) + return (cost, 0) +end + +# Callback function generator +storecostscallback(store) = (args...) -> storecostscallback(store, args...) 
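For reference, a minimal usage sketch of the callback API added in src/callbacks.jl above (not part of the diff, and assuming a problem::NLLSProblem has already been constructed): per-iteration costs and update steps are collected in a CostTrajectory passed via storecostscallback, mirroring how the updated examples/rosenbrock.jl and tests call the new optimize! signature.

using NLLSsolver

# Collect per-iteration costs and update steps in a CostTrajectory store
ct = NLLSsolver.CostTrajectory()
# optimize!(problem, options, unfixed, callback); unfixed=nothing means all variables are free
result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(), nothing, NLLSsolver.storecostscallback(ct))
# ct.costs holds the cost after each iteration; ct.trajectory holds each update step
empty!(ct)  # reset the store before reusing it for another optimization run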
diff --git a/src/iterators.jl b/src/iterators.jl index 00646c32..414e9d68 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -24,9 +24,10 @@ end # Dogleg optimization mutable struct DoglegData trustradius::Float64 + cauchy::Vector{Float64} - function DoglegData() - return new(0.0) + function DoglegData(varlen) + return new(0.0, Vector{Float64}(undef, varlen)) end end @@ -35,8 +36,8 @@ function iterate!(doglegdata::DoglegData, data, problem::NLLSProblem, options::N data.timesolver += @elapsed_ns begin # Compute the Cauchy step gnorm2 = gradient' * gradient - a = gnorm2 / ((gradient' * hessian) * gradient + floatmin(eltype(gradient))) - cauchy = -a * gradient + a = gnorm2 / (fast_bAb(hessian, gradient) + floatmin(eltype(gradient))) + doglegdata.cauchy = -a * gradient alpha2 = a * a * gnorm2 alpha = sqrt(alpha2) if doglegdata.trustradius == 0 @@ -55,7 +56,7 @@ function iterate!(doglegdata::DoglegData, data, problem::NLLSProblem, options::N # Determine the step if !(alpha < doglegdata.trustradius) # Along first leg - data.linsystem.x .= (doglegdata.trustradius / alpha) * cauchy + data.linsystem.x .= (doglegdata.trustradius / alpha) * doglegdata.cauchy linear_approx = doglegdata.trustradius * (2 * alpha - doglegdata.trustradius) / (2 * a) else # Along second leg @@ -65,9 +66,9 @@ function iterate!(doglegdata::DoglegData, data, problem::NLLSProblem, options::N else # Find the point along the Cauchy -> Newton line on the trust # region circumference - data.linsystem.x .-= cauchy + data.linsystem.x .-= doglegdata.cauchy sq_leg = data.linsystem.x' * data.linsystem.x - c = cauchy' * data.linsystem.x + c = doglegdata.cauchy' * data.linsystem.x trsq = doglegdata.trustradius * doglegdata.trustradius - alpha2 step = sqrt(c * c + sq_leg * trsq) if c <= 0 @@ -76,7 +77,7 @@ function iterate!(doglegdata::DoglegData, data, problem::NLLSProblem, options::N step = trsq / (c + step) end data.linsystem.x .*= step - data.linsystem.x .+= cauchy + data.linsystem.x .+= doglegdata.cauchy linear_approx = 0.5 * (a * (1 - step) ^ 2 * gnorm2) + step * (2 - step) * cost_ end end @@ -132,7 +133,7 @@ function iterate!(levmardata::LevMarData, data, problem::NLLSProblem, options::N if !(cost_ > data.bestcost) || (maximum(abs, data.linsystem.x) < options.dstep) # Success (or convergence) - update lambda uniformscaling!(hessian, -lastlambda) - stepquality = (cost_ - data.bestcost) / (((data.linsystem.x' * hessian) * 0.5 + gradient') * data.linsystem.x) + stepquality = (cost_ - data.bestcost) / (0.5 * fast_bAb(hessian, data.linsystem.x) + dot(gradient, data.linsystem.x)) levmardata.lambda *= stepquality < 0.983 ? 
1 - (2 * stepquality - 1) ^ 3 : 0.1 # Return the cost return cost_ diff --git a/src/optimize.jl b/src/optimize.jl index 7dde5663..49102389 100644 --- a/src/optimize.jl +++ b/src/optimize.jl @@ -1,10 +1,9 @@ # Uni-variate optimization (single unfixed variable) -optimize!(problem::NLLSProblem, options::NLLSOptions, unfixed::Integer, starttimens=Base.time_ns())::NLLSResult = optimizeinternal!(problem, options, NLLSInternal(UInt(unfixed), nvars(problem.variables[unfixed])), starttimens) +optimize!(problem::NLLSProblem, options::NLLSOptions, unfixed::Integer, callback=nullcallback, starttimens=Base.time_ns())::NLLSResult = optimizeinternal_checkcallback!(problem, options, NLLSInternal(UInt(unfixed), nvars(problem.variables[unfixed])), callback, starttimens) # Multi-variate optimization -optimize!(problem::NLLSProblem, options::NLLSOptions, unfixed::Type) = optimize!(problem, options, typeof.(variables).==unfixed) -function optimize!(problem::NLLSProblem, options::NLLSOptions=NLLSOptions(), unfixed::AbstractVector=trues(length(problem.variables)))::NLLSResult +function optimize!(problem::NLLSProblem, options::NLLSOptions, unfixed::AbstractVector, callback)::NLLSResult starttime = Base.time_ns() @assert length(problem.variables) > 0 # Compute the number of free variables (nblocks) @@ -12,12 +11,20 @@ function optimize!(problem::NLLSProblem, options::NLLSOptions=NLLSOptions(), unf if nblocks == 1 # One unfixed variable unfixed = findfirst(unfixed) - return optimize!(problem, options, unfixed, starttime) + return optimize!(problem, options, unfixed, callback, starttime) end # Multiple variables. Use a block sparse matrix - return optimizeinternal!(problem, options, NLLSInternal(makesymmvls(problem, unfixed, nblocks)), starttime) + return optimizeinternal_checkcallback!(problem, options, NLLSInternal(makesymmvls(problem, unfixed, nblocks)), callback, starttime) end +# Conversions for different types of "unfixed" +convertunfixed(::Nothing, problem) = trues(length(problem.variables)) +convertunfixed(unfixed::Type, problem) = typeof.(problem.variables) .== unfixed +convertunfixed(unfixed, problem) = unfixed + +# Default options +optimize!(problem::NLLSProblem, options::NLLSOptions=NLLSOptions(), unfixed=nothing, callback=nullcallback) = optimize!(problem, options, convertunfixed(unfixed, problem), callback) + # Optimize one variable at a time function optimizesingles!(problem::NLLSProblem{VT, CT}, options::NLLSOptions, type::DataType)::NLLSResult where {VT, CT} starttime = Base.time_ns() @@ -60,7 +67,16 @@ function optimizesingles!(problem::NLLSProblem{VT, CT}, options::NLLSOptions, ty return NLLSResult(startcost, endcost, (Base.time_ns() - starttime)*1.e-9, timeinit + subprobinit*1.e-9, timecost, timegradient, timesolver, 0, iternum, costcomputations, gradientcomputations, linearsolvers, Vector{Float64}(), Vector{Vector{Float64}}()) end -function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data::NLLSInternal, starttimens::UInt64)::NLLSResult +function optimizeinternal_checkcallback!(problem::NLLSProblem, options::NLLSOptions, data::NLLSInternal, callback, starttimens::UInt64)::NLLSResult + if isnothing(options.callback) + return optimizeinternal!(problem, options, data, callback, starttimens) + else + Base.depwarn("Setting callback in options is deprecated. 
Pass the callback directly to optimize!() instead", :NLLSOptions) + return optimizeinternal!(problem, options, data, options.callback, starttimens) + end +end + +function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data::NLLSInternal, callback, starttimens::UInt64)::NLLSResult # Copy the variables if length(problem.variables) != length(problem.varnext) problem.varnext = copy(problem.variables) @@ -69,27 +85,27 @@ function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data::NLL if options.iterator == newton || options.iterator == gaussnewton # Newton's method, using Gauss' approximation to the Hessian (optimizing Hessian form) newtondata = NewtonData() - return optimizeinternal!(problem, options, data, newtondata, starttimens) + return optimizeinternal!(problem, options, data, newtondata, callback, starttimens) end if options.iterator == levenbergmarquardt # Levenberg-Marquardt levmardata = LevMarData() - return optimizeinternal!(problem, options, data, levmardata, starttimens) + return optimizeinternal!(problem, options, data, levmardata, callback, starttimens) end if options.iterator == dogleg # Dogleg - doglegdata = DoglegData() - return optimizeinternal!(problem, options, data, doglegdata, starttimens) + doglegdata = DoglegData(length(data.linsystem.x)) + return optimizeinternal!(problem, options, data, doglegdata, callback, starttimens) end if options.iterator == gradientdescent # Gradient descent gddata = GradientDescentData(1.0) - return optimizeinternal!(problem, options, data, gddata, starttimens) + return optimizeinternal!(problem, options, data, gddata, callback, starttimens) end error("Iterator not recognized") end -function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data, iteratedata, starttime::UInt64)::NLLSResult +function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data, iteratedata, callback, starttime::UInt64)::NLLSResult timeinit = Base.time_ns() - starttime # Initialize the linear problem data.timegradient += @elapsed_ns data.bestcost = costgradhess!(data.linsystem, problem.variables, problem.costs) @@ -107,7 +123,7 @@ function optimizeinternal!(problem::NLLSProblem, options::NLLSOptions, data, ite # Call the per iteration solver cost = iterate!(iteratedata, data, problem, options)::Float64 # Call the user-defined callback - cost, terminate = options.callback(cost, problem, data, iteratedata)::Tuple{Float64, Int} + cost, terminate = callback(cost, problem, data, iteratedata)::Tuple{Float64, Int} # Store the cost if necessary if options.storecosts push!(costs, cost) diff --git a/src/structs.jl b/src/structs.jl index 139a1527..9fb51152 100644 --- a/src/structs.jl +++ b/src/structs.jl @@ -30,31 +30,14 @@ struct NLLSOptions storecosts::Bool # Indicates whether the cost per outer iteration should be stored storetrajectory::Bool # Indicates whether the step per outer iteration should be stored end -function NLLSOptions(; maxiters=100, reldcost=1.e-15, absdcost=1.e-15, dstep=1.e-15, maxfails=3, iterator=levenbergmarquardt, callback=(cost, args...)->(cost, 0), storecosts=false, storetrajectory=false) +function NLLSOptions(; maxiters=100, reldcost=1.e-15, absdcost=1.e-15, dstep=1.e-15, maxfails=3, iterator=levenbergmarquardt, callback=nothing, storecosts=false, storetrajectory=false) if iterator == gaussnewton Base.depwarn("gaussnewton is deprecated. 
Use newton instead", :NLLSOptions) end - NLLSOptions(reldcost, absdcost, dstep, maxfails, maxiters, iterator, callback, storecosts, storetrajectory) -end - -# Utility callback that prints out per-iteration results -function printoutcallback(cost, problem, data, trailingargs...) - if data.iternum == 1 - # First iteration, so print out column headers and the zeroth iteration (i.e. start) values - println("iter cost cost_change |step|") - @printf("% 4d % 8e % 4.3e % 3.2e\n", 0, data.bestcost, 0, 0) + if storecosts || storetrajectory + Base.depwarn("storecosts and storetrajectory are deprecated. Use storecostscallback instead", :NLLSOptions) end - @printf("% 4d % 8e % 4.3e % 3.2e\n", data.iternum, cost, data.bestcost-cost, norm(data.linsystem.x)) - return cost, 0 -end -function printoutcallback(cost, data, trradius::Float64) - if data.iternum == 1 - # First iteration, so print out column headers and the zeroth iteration (i.e. start) values - println("iter cost cost_change |step| tr_radius") - @printf("% 4d % 8e % 4.3e % 3.2e % 2.1e\n", 0, data.bestcost, 0, 0, trradius) - end - @printf("% 4d % 8e % 4.3e % 3.2e % 2.1e\n", data.iternum, cost, data.bestcost-cost, norm(data.linsystem.x), trradius) - return cost, 0 + NLLSOptions(reldcost, absdcost, dstep, maxfails, maxiters, iterator, callback, storecosts, storetrajectory) end struct NLLSResult diff --git a/src/utils.jl b/src/utils.jl index f4d113b1..24bb96d3 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -using StaticArrays, Static +using StaticArrays, Static, SparseArrays, LoopVectorization @inline function valuedispatch(lower::StaticInt, upper::StaticInt, val::Int, fun) if lower >= upper @@ -55,8 +55,46 @@ end function Base.cumsum!(A::AbstractVector) total = zero(eltype(A)) @inbounds for ind in eachindex(A) - total += A[ind] + @fastmath total += A[ind] A[ind] = total end return A end + +function fast_bAb(A::Matrix, b::Vector) + total = zero(eltype(b)) + @tturbo for i in eachindex(b) + subtotal = zero(eltype(b)) + for j in eachindex(b) + subtotal += A[j,i] * b[j] + end + total += b[i] * subtotal + end + return total +end + +function fast_bAb(A::StaticArray, b::StaticArray) + total = zero(eltype(b)) + @turbo for i in eachindex(b) + subtotal = zero(eltype(b)) + for j in eachindex(b) + subtotal += A[j,i] * b[j] + end + total += b[i] * subtotal + end + return total +end + +function fast_bAb(A::SparseMatrixCSC, b::Vector) + total = zero(eltype(b)) + for i in eachindex(b) + coltotal = zero(eltype(b)) + for j in nzrange(A, i) + @inbounds @fastmath coltotal += A.nzval[j] * b[A.rowval[j]] + end + @inbounds @fastmath coltotal *= b[i] + total += coltotal + end + return total +end + diff --git a/src/visualgeometry/geometry.jl b/src/visualgeometry/geometry.jl index c7acf34c..65d9d48d 100644 --- a/src/visualgeometry/geometry.jl +++ b/src/visualgeometry/geometry.jl @@ -1,10 +1,6 @@ using StaticArrays, LinearAlgebra function rodrigues(x::T, y::T, z::T) where T<:Number - if x == 0 && y == 0 && z == 0 - # Short cut for derivatives at identity - return SMatrix{3, 3, T, 9}(T(1), z, -y, -z, T(1), x, y, -x, T(1)) - end theta2 = x * x + y * y + z * z cosf = T(0.5) sinc = T(1) @@ -62,7 +58,7 @@ end Rotation3DR(x, y, z) = Rotation3DR(rodrigues(x, y, z)) Rotation3DR() = Rotation3DR(SMatrix{3, 3, Float64}(1., 0., 0., 0., 1., 0., 0., 0., 1.)) nvars(::Rotation3DR) = static(3) -update(var::Rotation3DR, updatevec, start=1) = var * Rotation3DR(updatevec[start], updatevec[start+1], updatevec[start+2]) +update(var::Rotation3DR, updatevec, start=1) = Rotation3DR(var.m 
* rodrigues(updatevec[start], updatevec[start+1], updatevec[start+2])) struct Rotation3DL{T<:Real} <: AbstractRotation3D m::SMatrix{3, 3, T, 9} @@ -70,8 +66,7 @@ end Rotation3DL(x, y, z) = Rotation3DL(rodrigues(x, y, z)) Rotation3DL() = Rotation3DL(SMatrix{3, 3, Float64}(1., 0., 0., 0., 1., 0., 0., 0., 1.)) nvars(::Rotation3DL) = static(3) -update(var::Rotation3DL, updatevec, start=1) = Rotation3DL(updatevec[start], updatevec[start+1], updatevec[start+2]) * var - +update(var::Rotation3DL, updatevec, start=1) = Rotation3DL(rodrigues(updatevec[start], updatevec[start+1], updatevec[start+2]) * var.m) struct UnitVec3D{T<:Real} <: AbstractPoint3D v::Rotation3DR{T} diff --git a/test/adaptivecost.jl b/test/adaptivecost.jl index 072407a7..65805790 100644 --- a/test/adaptivecost.jl +++ b/test/adaptivecost.jl @@ -51,7 +51,7 @@ end problem.variables[3] = 0. # Optimize the cost by alternating EM & optimizing the mean - result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.newton, callback=emcallback), SVector(1, 2, 3) .> 1) + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.newton), SVector(1, 2, 3) .> 1, emcallback) # Check the result @test isapprox(NLLSsolver.params(problem.variables[1]), SVector(1.0, 10.0, 0.8); rtol=0.1) diff --git a/test/dynamicvars.jl b/test/dynamicvars.jl index 5e5f9f32..00a47d64 100644 --- a/test/dynamicvars.jl +++ b/test/dynamicvars.jl @@ -33,7 +33,7 @@ Base.eltype(::NormResidual) = Float64 NLLSsolver.addcost!(problem, NormResidual(length(X))) # Optimize - result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.newton, storecosts=true)) + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.newton)) # Check the result is collinear to X Y = problem.variables[1] diff --git a/test/functional.jl b/test/functional.jl index bded1328..78b4c9a8 100644 --- a/test/functional.jl +++ b/test/functional.jl @@ -48,29 +48,38 @@ Base.eltype(::RosenbrockB) = Float64 @test NLLSsolver.cost(subprob) == 0. 
# Check callback termination - result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(callback=(cost, unusedargs...)->(cost, 13))) + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(), nothing, (cost, unusedargs...)->(cost, 13)) @test result.termination == (13 << 9) @test result.niterations == 1 - # Test optimization - for iter in [NLLSsolver.newton, NLLSsolver.levenbergmarquardt, NLLSsolver.dogleg] - # Set the start - problem.variables[1] = -0.5 - problem.variables[2] = 2.5 + # Optimize using Newton + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.newton)) + @test isapprox(problem.variables[1], 1.0; rtol=1.e-10) + @test isapprox(problem.variables[2], 1.0; rtol=1.e-10) - # Optimize the cost - result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=iter)) + # Optimize using Levenberg-Marquardt + problem.variables[1] = -0.5 + problem.variables[2] = 2.5 + ct = NLLSsolver.CostTrajectory() + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.levenbergmarquardt), nothing, NLLSsolver.storecostscallback(ct)) + @test isapprox(problem.variables[1], 1.0; rtol=1.e-10) + @test isapprox(problem.variables[2], 1.0; rtol=1.e-10) + @test all(diff(ct.costs) .<= 0.0) # Check costs decrease - # Check the result - @test isapprox(problem.variables[1], 1.0; rtol=1.e-10) - @test isapprox(problem.variables[2], 1.0; rtol=1.e-10) - end + # Optimize using dogleg + problem.variables[1] = -0.5 + problem.variables[2] = 2.5 + empty!(ct) + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.dogleg), nothing, NLLSsolver.storecostscallback(ct.costs)) + @test isapprox(problem.variables[1], 1.0; rtol=1.e-10) + @test isapprox(problem.variables[2], 1.0; rtol=1.e-10) + @test all(diff(ct.costs) .<= 0.0) # Check costs decrease # Test standard gradient descent (a worse optimizer, so needs a closer starting point) problem.variables[1] = 1.0 - 1.e-5 problem.variables[2] = 1.0 display(problem) - result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.gradientdescent, callback=NLLSsolver.printoutcallback)) + result = NLLSsolver.optimize!(problem, NLLSsolver.NLLSOptions(iterator=NLLSsolver.gradientdescent), nothing, NLLSsolver.printoutcallback) display(result) @test isapprox(problem.variables[1], 1.0; rtol=1.e-5) @test isapprox(problem.variables[2], 1.0; rtol=1.e-5) diff --git a/test/geometry.jl b/test/geometry.jl index 495620ad..a43213ca 100644 --- a/test/geometry.jl +++ b/test/geometry.jl @@ -1,4 +1,6 @@ -using NLLSsolver, LinearAlgebra, Test +using NLLSsolver, LinearAlgebra, StaticArrays, Test + +checkduals(x, y) = NLLSsolver.extractvaldual(vec(x)) == NLLSsolver.extractvaldual(vec(y)) @testset "geometry.jl" begin # Test utility functions @@ -22,6 +24,15 @@ using NLLSsolver, LinearAlgebra, Test @test isapprox(NLLSsolver.getvec((rotl * rotr) * point), NLLSsolver.getvec(point)) @test isapprox(NLLSsolver.getvec((NLLSsolver.inverse(rotl) * NLLSsolver.inverse(rotr)) * point), NLLSsolver.getvec(point)) + # Test rotation autodiff updates + updatevec = NLLSsolver.dualzeros(Float64, Val(4)) + T = eltype(updatevec) + updatemat = SMatrix{3, 3, T, 9}(T(1), updatevec[3], -updatevec[2], -updatevec[3], T(1), updatevec[1], updatevec[2], -updatevec[1], T(1)) + rotr = NLLSsolver.Rotation3DR(randn(), randn(), randn()) + rotl = NLLSsolver.Rotation3DL(randn(), randn(), randn()) + @test checkduals(NLLSsolver.update(rotr, updatevec).m, rotr.m * updatemat) + @test checkduals(NLLSsolver.update(rotl, 
updatevec).m, updatemat * rotl.m) + # Test poses updatevec = zeros(6) pose = NLLSsolver.update(NLLSsolver.Pose3D(rotr, point), updatevec) diff --git a/test/utils.jl b/test/utils.jl index 6285b7ea..ae90f82b 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,10 +1,17 @@ -using NLLSsolver, Test +using NLLSsolver, Test, SparseArrays, StaticArrays, LinearAlgebra + +test_fast_bAb(A, b) = @test NLLSsolver.fast_bAb(A, b) ≈ dot(b, A, b) @testset "utils.jl" begin @test NLLSsolver.runlengthencodesortedints([0, 0, 1, 1, 3]) == [1, 3, 5, 5, 6] @test NLLSsolver.runlengthencodesortedints([3]) == [1, 1, 1, 1, 2] @test NLLSsolver.runlengthencodesortedints([0]) == [1, 2] + x = [1, 2, 3] @test cumsum!(x) == [1, 3, 6] @test x == [1, 3, 6] + + test_fast_bAb(randn(20, 20), randn(20)) + test_fast_bAb(sprandn(100, 100, 0.02), randn(100)) + test_fast_bAb(randn(SMatrix{10, 10, Float64, 100}), randn(SVector{10, Float64})) end
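As a closing note, a small sketch (not part of the diff) of what the new fast_bAb helper in src/utils.jl computes: the quadratic form b' * A * b, which the tests above verify against LinearAlgebra.dot for dense, sparse and static matrices.

using NLLSsolver, LinearAlgebra

A = randn(6, 6)
b = randn(6)
# fast_bAb(A, b) is a LoopVectorization-accelerated b' * A * b
@assert NLLSsolver.fast_bAb(A, b) ≈ dot(b, A, b)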