
Release 3.2.2
* Performance improvements
* More tests
ojwoodford authored Sep 23, 2023
1 parent d268085 commit ced2471
Showing 8 changed files with 81 additions and 84 deletions.
4 changes: 1 addition & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "NLLSsolver"
uuid = "50b5cf9e-bf7a-4e29-8981-4280cbba70cb"
authors = ["Oliver Woodford"]
version = "3.2.1"
version = "3.2.2"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -12,7 +12,6 @@ IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
LDLFactorizations = "40e66cde-538c-5869-a4ad-c39174c6795b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
@@ -25,7 +24,6 @@ HybridArrays = "0.4"
IfElse = "0.1.1"
LDLFactorizations = "0.10"
LoopVectorization = "0.12.165"
OrderedCollections = "1.6.2"
Static = "0.8.7"
StaticArrays = "1"
VisualGeometryDatasets = "0.2.1"
28 changes: 14 additions & 14 deletions src/VectorRepo.jl
@@ -1,9 +1,8 @@
using OrderedCollections

struct VectorRepo{T}
data::OrderedDict{DataType, Vector}
data::Dict{DataType, Vector}
function VectorRepo{T}() where T
return new{T}(OrderedDict{DataType, Vector}())
return new{T}(Dict{DataType, Vector}())
end
end
VectorRepo() = VectorRepo{Any}()
@@ -70,27 +69,28 @@ end
@inline vrsum(fun, vr::VectorRepo, init, T::DataType) = sum(fun, get(vr, T); init=init)

# Sum reduction over a subset of the elements
@inline sumsubset(fun, subsetfun, vr::VectorRepo{Any}) = sum(fixallbutlast(sumsubset, fun, subsetfun), values(vr.data); init=0.0)
@inline sumsubset(fun, subsetfun, vr::VectorRepo{T}) where T = sumsubset(fun, subsetfun, vr, T)
@inline sumsubset(fun, subsetfun, vr::VectorRepo, T::Union) = sumsubset(fun, subsetfun, vr, T.a) + sumsubset(fun, subsetfun, vr, T.b)
@inline sumsubset(fun, subsetfun, vr::VectorRepo, T::DataType) = sumsubset(fun, subsetfun, get(vr, T))
@inline sumsubset(fun, subsetfun::Function, vector::Vector) = sumsubset(fun, subsetfun(vector), vector)
function sumsubset(fun, subset::Union{BitVector, Vector{Bool}}, vector::Vector)::Float64
sumsubset(fun, subsetfun, vr::VectorRepo{Any}) = sum(fixallbutlast(sumsubsetvec, fun, subsetfun), values(vr.data); init=0.0)
sumsubset(fun, subsetfun, vr::VectorRepo{T}) where T = sumsubsettype(fun, subsetfun, vr, T)
@inline sumsubsettype(fun, subsetfun, vr, T::Union) = sumsubsettype(fun, subsetfun, vr, T.a) + sumsubsettype(fun, subsetfun, vr, T.b)
@inline sumsubsettype(fun, subsetfun, vr, T::DataType) = sumsubsetparamtype(fun, subsetfun, vr, T)
@inline sumsubsetparamtype(fun, subsetfun, vr, ::Type{T}) where T = sumsubsetvec(fun, subsetfun, get(vr, T)::Vector{T})
@inline sumsubsetvec(fun, subsetfun, vector) = sumsubsetloop(fun, subsetfun(vector), vector)
function sumsubsetloop(fun, subset::Union{BitVector, Vector{Bool}}, vector)::Float64
total = 0.0
for (ind, val) in enumerate(subset)
if val
total += fun(vector[ind])
@fastmath total += @inbounds fun(vector[ind])::Float64
end
end
return total
end
function sumsubset(fun, subset, vector::Vector)::Float64
function sumsubsetloop(fun, subset, vector)::Float64
total = 0.0
for ind in subset
total += fun(vector[ind])
@simd for ind in subset
@fastmath total += @inbounds fun(vector[ind])::Float64
end
return total
end

# Map
Base.map(fun, vr::VectorRepo) = OrderedDict([eltype(val)=>fun(val) for val in values(vr)]...)
Base.map(fun, vr::VectorRepo) = Dict([eltype(val)=>fun(val) for val in values(vr)]...)
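
The repo's data field is a Dict{DataType, Vector}, so a plain lookup returns an abstractly typed Vector, and the old single-name sumsubset chain left the summation loop dynamically dispatched. The renamed helpers act as a function barrier: sumsubsetparamtype asserts get(vr, T)::Vector{T}, handing sumsubsetvec a concrete type. A minimal standalone sketch of that pattern (hypothetical names, not part of the package):

using Test

data = Dict{DataType, Vector}(Float64 => [1.0, 2.0, 3.0], Int => [4, 5])

# Abstract lookup: data[T] isa Vector, so sum cannot specialize.
sumslow(T) = sum(data[T])

# Function barrier: the type assertion recovers Vector{T}, so the
# inner sum compiles to a tight loop over a concretely typed array.
sumfast(::Type{T}) where T = sum(data[T]::Vector{T})

@test sumslow(Float64) == sumfast(Float64) == 6.0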
12 changes: 6 additions & 6 deletions src/linearsystem.jl
@@ -140,8 +140,8 @@ end

function updatesymlinearsystem!(linsystem::UniVariateLS, g, H, unusedargs...)
# Update the blocks in the problem
linsystem.b .+= g
linsystem.A .+= H
@fastmath linsystem.b .+= g
@fastmath linsystem.A .+= H
end

function updatesymA!(A, a, vars, varflags, blockindices)
@@ -152,7 +152,7 @@ function updatesymA!(A, a, vars, varflags, blockindices)
nvi = nvars(vars[i])
rangei = SR(1, nvi) .+ loffseti
loffseti += nvi
@inbounds block(A, blockindices[i], blockindices[i], nvi, nvi) .+= view(a, rangei, rangei)
@fastmath @inbounds block(A, blockindices[i], blockindices[i], nvi, nvi) .+= view(a, rangei, rangei)

loffsetj = static(0)
@unroll for j in 1:i-1
@@ -161,9 +161,9 @@
rangej = SR(1, nvj) .+ loffsetj
loffsetj += nvj
if @inbounds blockindices[i] >= blockindices[j] # Make sure the block matrix is lower triangular
@inbounds block(A, blockindices[i], blockindices[j], nvi, nvj) .+= view(a, rangei, rangej)
@fastmath @inbounds block(A, blockindices[i], blockindices[j], nvi, nvj) .+= view(a, rangei, rangej)
else
@inbounds block(A, blockindices[j], blockindices[i], nvj, nvi) .+= view(a, rangej, rangei)
@fastmath @inbounds block(A, blockindices[j], blockindices[i], nvj, nvi) .+= view(a, rangej, rangei)
end
end
end
@@ -178,7 +178,7 @@ function updateb!(B, b, vars, varflags, boffsets, blockindices)
if i <= length(vars) && bitiset(varflags, i)
nv = nvars(vars[i])
range = SR(0, nv-1)
@inbounds view(B, range .+ boffsets[blockindices[i]]) .+= view(b, range .+ loffset)
@fastmath @inbounds view(B, range .+ boffsets[blockindices[i]]) .+= view(b, range .+ loffset)
loffset += nv
end
end
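
The only change in this file is wrapping the in-place block accumulations in @fastmath, which permits reassociation and FMA contraction of the floating-point additions. A standalone sketch of the pattern on a plain matrix (illustrative shapes, not the package's block layout):

A = zeros(6, 6)          # stands in for the Hessian block matrix
a = rand(3, 3)           # one cost term's local contribution
rows, cols = 1:3, 4:6    # destination block
@fastmath @inbounds view(A, rows, cols) .+= a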
5 changes: 3 additions & 2 deletions src/problem.jl
@@ -160,11 +160,11 @@ function getvarcostmap(problem::NLLSProblem)
return problem.varcostmap
end

function reordercostsforschur(problem::NLLSProblem, schurvars)
function reordercostsforschur!(problem::NLLSProblem, schurvars)
# Group residuals by Schur variable (i.e. variables to be factored out)
# Get the cost index per Schur variable (0 if none)
schurruns = Dict{DataType, Vector{Int}}()
costvarmap = sparse(view(getvarcostmap(problem), schurvars, :)')
costvarmap = sparse(getvarcostmap(problem)')[:,schurvars]
schurvarpercost = sum(costvarmap; dims=2)
@assert all(x->(x<=1), schurvarpercost) "Each cost block can only depend on one schur variable at most"
schurvarpercost[schurvarpercost .> 0] = costvarmap.rowval
@@ -180,6 +180,7 @@ function reordercostsforschur(problem::NLLSProblem, schurvars)
permute!(costs, sortedorder)
end
end
problem.varcostmapvalid = false
return schurruns
end

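
Two changes here: the function gains a ! because it permutes problem.costs in place, and it now flags the cached variable-to-cost map as stale (problem.varcostmapvalid = false) so getvarcostmap rebuilds it after the reorder. A minimal sketch of that invalidate-on-mutate pattern (hypothetical struct, not the package API):

mutable struct CachedMap
    map::Vector{Int}
    valid::Bool
end

function getmap!(c::CachedMap, recompute::Function)
    if !c.valid              # stale: rebuild on demand
        c.map = recompute()
        c.valid = true
    end
    return c.map
end

c = CachedMap(Int[], false)
getmap!(c, () -> [1, 2, 3])  # computes and caches
c.valid = false              # any in-place reorder invalidates
getmap!(c, () -> [3, 2, 1])  # recomputes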
54 changes: 21 additions & 33 deletions src/residual.jl
@@ -30,52 +30,40 @@ function computerescostgradhess(varflags, residual, kernel, vars)
# Compute the unrobust cost, gradient and Hessian
cost = sqnorm(res)
g = jac' * res
H = jac' * jac

if varflags & 1 == 0
# The kernel is not optimized. Differentiate w.r.t. the cost only
cost, dc, d2c = robustifydcost(kernel, cost)

# IRLS reweighting of Hessian
if dc != 1
H *= dc
end
# Second order correction
if d2c != 0
H += ((2 * d2c) * g) * g'
end
# IRLS reweighting of gradient
if dc != 1
g *= dc
end

# Return the cost
return 0.5 * Float64(cost), g, H
else
# Differentiate w.r.t. the cost and kernel parameters
cost, dc_, d2c_ = robustifydkernel(kernel, cost)
dc = dc_[end]
d2c = d2c_[end,end]

# Compute the d^2/dkernel.dvariables block
kernelvarind = SR(1, nvars(kernel))
@fastmath dkdv = g * view(d2c_, kernelvarind, nvars(kernel)+1)'
end

# Differentiate w.r.t. the cost and kernel parameters
cost, dc, d2c = robustifydkernel(kernel, cost)

# Compute the d^2/dkernel.dvariables block
kernelvarind = SR(1, nvars(kernel))
dkdv = g * view(d2c, kernelvarind, nvars(kernel)+1)'

# IRLS reweighting of Hessian
if dc[end] != 1
H *= dc[end]
H = jac' * jac
if dc != 1
H *= dc
end
# Second order correction
if d2c[end] != 0
H += ((2 * d2c[end]) * g) * g'
if d2c != 0
H += ((2 * d2c) * g) * g'
end
# IRLS reweighting of gradient
if dc[end] != 1
g *= dc[end]
if dc != 1
g *= dc
end

# Add on the kernel derivative blocks
g = vcat(view(dc, kernelvarind), g)
H = hcat(vcat(view(d2c, kernelvarind, kernelvarind), dkdv), vcat(dkdv', H))
if varflags & 1 == 1
# Add on the kernel derivative blocks
g = vcat(view(dc_, kernelvarind), g)
H = hcat(vcat(view(d2c_, kernelvarind, kernelvarind), dkdv), vcat(dkdv', H))
end

# Return the cost and derivatives
return 0.5 * Float64(cost), g, H
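
The rewrite removes the duplicated reweighting: only the kernel-derivative bookkeeping differs between the two branches, so the IRLS reweighting and second-order correction now run once on a shared path. With unrobust cost c = ‖r‖², gradient g = Jᵀr, Gauss-Newton Hessian H = JᵀJ and kernel derivatives dc = ρ′(c), d2c = ρ″(c), the shared update is H ← dc·H + 2·d2c·g·gᵀ (using the unscaled g) followed by g ← dc·g. A standalone sketch in that notation (assumed names, not the package API):

function robustify(g::AbstractVector, H::AbstractMatrix, dc, d2c)
    if dc != 1
        H = dc * H                       # IRLS reweighting
    end
    if d2c != 0
        H = H + (2 * d2c) * (g * g')     # second-order correction, unscaled g
    end
    if dc != 1
        g = dc * g                       # reweight the gradient last
    end
    return g, H
end

g, H = robustify([1.0, 2.0], [2.0 0.0; 0.0 2.0], 0.5, -0.1)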
10 changes: 8 additions & 2 deletions src/utils.jl
@@ -26,7 +26,14 @@ bitiset(flags, bit) = (1 << (bit - 1)) & flags != 0
@inline uniontotuple(T::Union) = (uniontotuple(T.a)..., uniontotuple(T.b)...)
@inline uniontotuple(T::DataType) = (T,)

@inline sqnorm(x) = (x' * x)
sqnorm(x::Number) = @fastmath x * x
function sqnorm(vec::AbstractVector)
total = zero(eltype(vec))
@turbo for i in eachindex(vec)
total += vec[i] * vec[i]
end
return total
end

function runlengthencodesortedints(sortedints)
runindices = Vector{Int}(undef, sortedints[end]+2)
@@ -97,4 +104,3 @@ function fast_bAb(A::SparseMatrixCSC, b::Vector)
end
return total
end
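
sqnorm is split into two methods: a trivial scalar case and, for vectors, an explicit SIMD reduction via LoopVectorization's @turbo in place of the old generic x' * x. A quick standalone check that the new definition agrees with the old one (illustrative only):

using LoopVectorization, Test

sqnorm_new(x::Number) = @fastmath x * x
function sqnorm_new(vec::AbstractVector)
    total = zero(eltype(vec))
    @turbo for i in eachindex(vec)
        total += vec[i] * vec[i]
    end
    return total
end

v = rand(1000)
@test sqnorm_new(v) ≈ v' * v   # matches the old definition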

20 changes: 10 additions & 10 deletions test/VectorRepo.jl
@@ -32,15 +32,15 @@ end
vr1 = NLLSsolver.VectorRepo()
@test sum(i->2i, vr1) == 0.0
fillrepo(vr1, floats, ints)
@test isapprox(sum(i->π*i, vr1), total * π)
@test sum(i->π*i, vr1) ≈ total * π
vec = values(vr1)
@test length(vec) == 2 && any(Base.Fix2(isa, Vector{Float64}), vec) && any(Base.Fix2(isa, Vector{Int}), vec)

# Test subset reductions
@test NLLSsolver.sumsubset(identity, rangefun, vr1) == halftotal
@test NLLSsolver.sumsubset(identity, indicesfun, vr1) == halftotal
@test NLLSsolver.sumsubset(identity, bitvecfun, vr1) == halftotal
@test NLLSsolver.sumsubset(identity, boolvecfun, vr1) == halftotal
@test NLLSsolver.sumsubset(Float64, rangefun, vr1) == halftotal
@test NLLSsolver.sumsubset(Float64, indicesfun, vr1) == halftotal
@test NLLSsolver.sumsubset(Float64, bitvecfun, vr1) == halftotal
@test NLLSsolver.sumsubset(Float64, boolvecfun, vr1) == halftotal

# Union container
vr2 = NLLSsolver.VectorRepo{Union{Float64, Int, Char}}()
@@ -49,14 +49,14 @@
@test any(keytup .== Float64) && any(keytup .== Int) && any(keytup .== Char)
@test sum(i->2i, vr2) == 0.0
fillrepo(vr2, floats, ints)
@test isapprox(sum(i->π*i, vr2), total * π)
@test sum(i->π*i, vr2) ≈ total * π
valuetup = values(vr2)
@test isa(valuetup, Tuple) && length(valuetup) == 3
@test any(Base.Fix2(isa, Vector{Float64}), valuetup) && any(Base.Fix2(isa, Vector{Int}), valuetup) && any(Base.Fix2(isa, Vector{Char}), valuetup)

# Test subset reductions
@test NLLSsolver.sumsubset(identity, rangefun, vr2) == halftotal
@test NLLSsolver.sumsubset(identity, indicesfun, vr2) == halftotal
@test NLLSsolver.sumsubset(identity, bitvecfun, vr2) == halftotal
@test NLLSsolver.sumsubset(identity, boolvecfun, vr2) == halftotal
@test NLLSsolver.sumsubset(Float64, rangefun, vr2) == halftotal
@test NLLSsolver.sumsubset(Float64, indicesfun, vr2) == halftotal
@test NLLSsolver.sumsubset(Float64, bitvecfun, vr2) == halftotal
@test NLLSsolver.sumsubset(Float64, boolvecfun, vr2) == halftotal
end
32 changes: 18 additions & 14 deletions test/optimizeba.jl
@@ -15,26 +15,25 @@ Base.eltype(::ProjError) = Float64

function create_ba_problem(ncameras, nlandmarks, propvisible)
problem = NLLSProblem(Union{EffPose3D{Float64}, Point3D{Float64}}, ProjError)

# Generate the landmarks in a unit cube centered on the origin
for i = 1:nlandmarks
addvariable!(problem, Point3D{Float64}(rand(SVector{3, Float64}) .- 0.5))
end

# Generate the cameras on a unit sphere, pointing to the origin
for i = 1:ncameras
camcenter = normalize(randn(SVector{3, Float64}))
addvariable!(problem, EffPose3D(Rotation3DL(UnitVec3D(-camcenter).v.m), Point3D(camcenter)))
end

# Generate the landmarks in a unit cube centered on the origin
for i = 1:nlandmarks
addvariable!(problem, Point3D{Float64}(rand(SVector{3, Float64}) .- 0.5))
end

# Generate the measurements
visibility = sprand(ncameras, nlandmarks, propvisible)
for landmark = 1:nlandmarks
point = problem.variables[landmark]::Point3D{Float64}
for camera = view(visibility.rowval, visibility.colptr[landmark]:visibility.colptr[landmark+1]-1)
@assert camera <= ncameras
cam = camera + nlandmarks
addcost!(problem, ProjError(project(problem.variables[cam]::EffPose3D{Float64} * point), cam, landmark))
visibility = sprand(nlandmarks, ncameras, propvisible)
for camind = 1:ncameras
camera = problem.variables[camind]::EffPose3D{Float64}
for landmark = view(visibility.rowval, visibility.colptr[camind]:visibility.colptr[camind+1]-1)
landmarkind = landmark + ncameras
addcost!(problem, ProjError(project(camera * problem.variables[landmarkind]::Point3D{Float64}), camind, landmarkind))
end
end

@@ -55,11 +54,16 @@

@testset "optimizeba.jl" begin
# Generate some test data for a dense problem
Random.seed!(1)
Random.seed!(5)
problem = create_ba_problem(10, 100, 0.3)

# Optimize just the landmarks
# Test reordering the costs
problem = perturb_ba_problem(problem, 0.003, 0.0)
costbefore = cost(problem)
NLLSsolver.reordercostsforschur!(problem, isa.(problem.variables, Point3D{Float64}))
@test cost(problem) ≈ costbefore

# Optimize just the landmarks
result = NLLSsolver.optimizesingles!(problem, NLLSOptions(), Point3D{Float64})
@test result.bestcost < 1.e-15

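
create_ba_problem now adds the cameras first and builds visibility as a landmarks × cameras matrix, so the measurement loop walks the sparse matrix camera by camera, i.e. column by column, which is the contiguous direction for Julia's SparseMatrixCSC storage; it also groups each camera's costs together. The idiomatic accessors for that traversal (illustrative sketch, not the test code):

using SparseArrays

function visitcolumns(S::SparseMatrixCSC)
    rows = rowvals(S)
    for cam in 1:size(S, 2)
        for k in nzrange(S, cam)   # contiguous range of column cam
            landmark = rows[k]     # row index: a landmark seen by cam
            # ... add the (cam, landmark) cost here ...
        end
    end
end

visitcolumns(sprand(100, 10, 0.3))   # landmarks × cameras, as in the test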

1 comment on commit ced2471

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/92074

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v3.2.2 -m "<description of version>" ced2471c9225b176de4c800eca34fb903e1bb3f6
git push origin v3.2.2
