Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add davibarreira's sinkhorn_divergence with some modifications #145

Merged
merged 28 commits into from
Sep 22, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ecb81e1
add davibarreira's sinkhorn_divergence with some modifications
zsteve Sep 13, 2021
5e981e7
added documentation entry for sinkhorn_divergence
zsteve Sep 13, 2021
9828028
add statsbase to test deps
zsteve Sep 13, 2021
5c308e9
add empirical measure example for sinkhorn divergence
zsteve Sep 13, 2021
43d7a1d
format
zsteve Sep 13, 2021
e3fa3d9
add literate
zsteve Sep 13, 2021
5c347b1
implement symmetric sinkhorn
zsteve Sep 14, 2021
7f737ee
implement sinkhorn_loss
zsteve Sep 14, 2021
44659ae
change formula for obj()
zsteve Sep 14, 2021
a0dc59a
make empirical example run faster
zsteve Sep 14, 2021
48baadc
update docstrings
zsteve Sep 14, 2021
2f7179e
Update src/entropic/sinkhorn.jl
zsteve Sep 14, 2021
94055c2
Update src/entropic/sinkhorn.jl
zsteve Sep 14, 2021
a43ef32
address review comments
zsteve Sep 18, 2021
fefc503
Merge branch 'sinkhorn_divergence' of https://github.com/JuliaOptimal…
zsteve Sep 18, 2021
c8c8e9c
fix naming of plan and sinkhorn_plan
zsteve Sep 18, 2021
5cb77be
address comments
zsteve Sep 19, 2021
1026a44
fix sinkhorn_divergence and docs
zsteve Sep 19, 2021
edabeee
update docs
zsteve Sep 19, 2021
5b19d3b
format
zsteve Sep 19, 2021
2df1f54
remove sinkhorn_loss
zsteve Sep 19, 2021
440c191
Update examples/empirical_sinkhorn_div/script.jl
zsteve Sep 20, 2021
e214a68
Update src/entropic/sinkhorn_divergence.jl
zsteve Sep 20, 2021
5e02240
Update src/entropic/symmetric.jl
zsteve Sep 20, 2021
80dc5fe
Update src/entropic/sinkhorn_gibbs.jl
zsteve Sep 20, 2021
fb0277b
Update src/entropic/sinkhorn_gibbs.jl
zsteve Sep 20, 2021
ed1b947
Update src/entropic/symmetric.jl
zsteve Sep 20, 2021
f4d6474
bump version
zsteve Sep 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/OptimalTransport.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export SinkhornGibbs, SinkhornStabilized, SinkhornEpsilonScaling
export SinkhornBarycenterGibbs
export QuadraticOTNewton

export sinkhorn, sinkhorn2
export sinkhorn, sinkhorn2, sinkhorn_loss
export sinkhorn_stabilized, sinkhorn_stabilized_epsscaling, sinkhorn_barycenter
export sinkhorn_unbalanced, sinkhorn_unbalanced2
export sinkhorn_divergence
Expand Down
36 changes: 25 additions & 11 deletions src/entropic/sinkhorn.jl
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,12 @@ function sinkhorn2(μ, ν, C, ε, alg::Sinkhorn; regularization=false, plan=noth
return cost
end

# Fallback method for `sinkhorn_loss`: a specialized implementation exists only
# for `SinkhornGibbs` (and `SymmetricSinkhornGibbs`); every other `Sinkhorn`
# algorithm raises an informative error directing callers to `sinkhorn2`.
function sinkhorn_loss(μ, ν, C, ε, alg::Sinkhorn; kwargs...)
    msg = "sinkhorn_loss is only implemented for alg::SinkhornGibbs. For other algorithms please use sinkhorn2"
    return error(msg)
end

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just integrate this in sinkhorn2? Or rename sinkhorn2 to sinkhorn_loss?

"""
sinkhorn_divergence(μ::AbstractVecOrMat, ν::AbstractVecOrMat, C, ε, alg::Sinkhorn = SinkhornGibbs(); regularization = nothing, plan = nothing, kwargs...)
Compute the Sinkhorn Divergence between finite discrete
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Compute the Sinkhorn Divergence between finite discrete
Compute the Sinkhorn Divergence between finite discrete

Expand Down Expand Up @@ -251,13 +257,19 @@ function sinkhorn_divergence(
alg::Sinkhorn=SinkhornGibbs(),
algμ::Sinkhorn=SymmetricSinkhornGibbs(),
algν::Sinkhorn=SymmetricSinkhornGibbs();
regularization=nothing,
regularization=true,
plan=nothing,
kwargs...,
)
OTμν = sinkhorn2(μ, ν, C, ε, alg; plan=plan, regularization=false, kwargs...)
OTμ = sinkhorn2(μ, C, ε, algμ; plan=nothing, regularization=false, kwargs...)
OTν = sinkhorn2(ν, C, ε, algν; plan=nothing, regularization=false, kwargs...)
OTμν, OTμ, OTν = if (regularization == true) && (plan === nothing)
sinkhorn_loss(μ, ν, C, ε, alg; kwargs...),
sinkhorn_loss(μ, C, ε, algμ; kwargs...),
sinkhorn_loss(ν, C, ε, algν; kwargs...)
else
sinkhorn2(μ, ν, C, ε, alg; plan=plan, regularization=false, kwargs...),
sinkhorn2(μ, C, ε, algμ; plan=nothing, regularization=false, kwargs...),
sinkhorn2(ν, C, ε, algν; plan=nothing, regularization=false, kwargs...)
end
zsteve marked this conversation as resolved.
Show resolved Hide resolved
return max.(0, OTμν .- (OTμ .+ OTν) / 2)
zsteve marked this conversation as resolved.
Show resolved Hide resolved
end
"""
Expand Down Expand Up @@ -290,16 +302,18 @@ function sinkhorn_divergence(
alg::Sinkhorn=SinkhornGibbs(),
algμ::Sinkhorn=SymmetricSinkhornGibbs(),
algν::Sinkhorn=SymmetricSinkhornGibbs();
regularization=nothing,
regularization=true,
zsteve marked this conversation as resolved.
Show resolved Hide resolved
plan=nothing,
kwargs...,
)
if regularization !== nothing
@warn "`sinkhorn_divergence` does not support the `regularization` keyword argument"
OTμν, OTμ, OTν = if (regularization == true) && (plan === nothing)
sinkhorn_loss(μ, ν, Cμν, ε, alg; kwargs...),
sinkhorn_loss(μ, Cμ, ε, algμ; kwargs...),
sinkhorn_loss(ν, Cν, ε, algν; kwargs...)
else
sinkhorn2(μ, ν, Cμν, ε, alg; plan=plan, regularization=false, kwargs...),
sinkhorn2(μ, Cμ, ε, algμ; plan=nothing, regularization=false, kwargs...),
sinkhorn2(ν, Cν, ε, algν; plan=nothing, regularization=false, kwargs...)
end

OTμν = sinkhorn2(μ, ν, Cμν, ε, alg; plan=plan, regularization=false, kwargs...)
OTμ = sinkhorn2(μ, Cμ, ε, algμ; plan=nothing, regularization=false, kwargs...)
OTν = sinkhorn2(ν, Cν, ε, algν; plan=nothing, regularization=false, kwargs...)
return max.(0, OTμν - (OTμ + OTν) / 2)
end
10 changes: 10 additions & 0 deletions src/entropic/sinkhorn_gibbs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ function sinkhorn2(
)
end

"""
    sinkhorn_loss(μ, ν, C, ε, alg::SinkhornGibbs; kwargs...)

Run the Sinkhorn algorithm for source `μ`, target `ν`, cost matrix `C`, and
regularization parameter `ε`, and return the loss evaluated by `obj` from the
converged solver state (dual scalings `u` and `v`).
"""
function sinkhorn_loss(μ, ν, C, ε, alg::SinkhornGibbs; kwargs...)
    # Construct the solver state and iterate the Sinkhorn updates to convergence.
    s = build_solver(μ, ν, C, ε, alg; kwargs...)
    solve!(s)
    # Evaluate the loss from the converged scalings stored in the solver cache.
    return obj(s.cache.u, s.cache.v, s.source, s.target, s.eps)
end

# interface

prestep!(::SinkhornSolver{SinkhornGibbs}, ::Int) = nothing
Expand Down
10 changes: 10 additions & 0 deletions src/entropic/symmetric.jl
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,13 @@ function sinkhorn2(

return cost
end

"""
    sinkhorn_loss(μ, C, ε, alg::SymmetricSinkhornGibbs; kwargs...)

Run the symmetric Sinkhorn algorithm for measure `μ` with cost matrix `C` and
regularization parameter `ε`, and return the loss evaluated by `obj`. In the
symmetric case both marginals coincide, so the single scaling `u` and the
source measure are passed twice.
"""
function sinkhorn_loss(μ, C, ε, alg::SymmetricSinkhornGibbs; kwargs...)
    # Construct the symmetric solver state and iterate to convergence.
    s = build_solver(μ, C, ε, alg; kwargs...)
    solve!(s)
    # Symmetric problem: reuse `u` and `source` for both sides of `obj`.
    u = s.cache.u
    return obj(u, u, s.source, s.source, s.eps)
end
101 changes: 61 additions & 40 deletions test/entropic/sinkhorn_divergence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,51 +23,67 @@ Random.seed!(100)
C = pairwise(SqEuclidean(), x)
f(x; μ, σ) = exp(-((x - μ) / σ)^2)
# regularization parameter
ε = 0.01
ε = 0.05
@testset "basic" begin
μ = normalize!(f.(x; μ=0, σ=0.5), 1)
M = 100

ν_all = [normalize!(f.(x; μ=y, σ=0.5), 1) for y in range(-1, 1; length=M)]

loss = map(ν -> sinkhorn_divergence(μ, ν, C, ε), ν_all)
loss_ = map(
ν ->
sinkhorn2(μ, ν, C, ε) -
(sinkhorn2(μ, μ, C, ε) + sinkhorn2(ν, ν, C, ε)) / 2,
ν_all,
)
for reg in (true, false)
loss = map(ν -> sinkhorn_divergence(μ, ν, C, ε; regularization=reg), ν_all)
loss_ = map(
ν ->
sinkhorn2(μ, ν, C, ε; regularization=reg) -
(
sinkhorn2(μ, μ, C, ε; regularization=reg) +
sinkhorn2(ν, ν, C, ε; regularization=reg)
) / 2,
ν_all,
)

@test loss ≈ loss_
@test all(loss .≥ 0)
@test sinkhorn_divergence(μ, μ, C, ε) ≈ 0 atol = 1e-9
@test loss ≈ loss_ rtol = 1e-6
@test all(loss .≥ 0)
@test sinkhorn_divergence(μ, μ, C, ε) ≈ 0 atol = 1e-9
end
end
@testset "batch" begin
M = 10
μ = hcat([normalize!(f.(x; μ=randn(), σ=0.5), 1) for _ in 1:M]...)
ν = hcat([normalize!(f.(x; μ=randn(), σ=0.5), 1) for _ in 1:M]...)
loss_batch = sinkhorn_divergence(μ, ν, C, ε)
@test loss_batch ≈ [
sinkhorn_divergence(x, y, C, ε) for (x, y) in zip(eachcol(μ), eachcol(ν))
]
loss_batch_μ = sinkhorn_divergence(μ, ν[:, 1], C, ε)
@test loss_batch_μ ≈ [sinkhorn_divergence(x, ν[:, 1], C, ε) for x in eachcol(μ)]
loss_batch_ν = sinkhorn_divergence(μ[:, 1], ν, C, ε)
@test loss_batch_ν ≈ [sinkhorn_divergence(μ[:, 1], y, C, ε) for y in eachcol(ν)]
for reg in (true, false)
loss_batch = sinkhorn_divergence(μ, ν, C, ε; regularization=reg)
@test loss_batch ≈ [
sinkhorn_divergence(x, y, C, ε; regularization=reg) for
(x, y) in zip(eachcol(μ), eachcol(ν))
]
loss_batch_μ = sinkhorn_divergence(μ, ν[:, 1], C, ε; regularization=reg)
@test loss_batch_μ ≈ [
sinkhorn_divergence(x, ν[:, 1], C, ε; regularization=reg) for
x in eachcol(μ)
]
loss_batch_ν = sinkhorn_divergence(μ[:, 1], ν, C, ε; regularization=reg)
@test loss_batch_ν ≈ [
sinkhorn_divergence(μ[:, 1], y, C, ε; regularization=reg) for
y in eachcol(ν)
]
end
end
@testset "AD" begin
ε = 0.05
μ = normalize!(f.(x; μ=-0.5, σ=0.5), 1)
ν = normalize!(f.(x; μ=0.5, σ=0.5), 1)
for Diff in [ForwardDiff, ReverseDiff]
∇ = Diff.gradient(log.(ν)) do xs
sinkhorn_divergence(μ, softmax(xs), C, ε)
end
@test size(∇) == size(ν)
∇ = Diff.gradient(log.(μ)) do xs
sinkhorn_divergence(μ, softmax(xs), C, ε)
for reg in (true, false)
∇ = Diff.gradient(log.(ν)) do xs
sinkhorn_divergence(μ, softmax(xs), C, ε; regularization=reg)
end
@test size(∇) == size(ν)
∇ = Diff.gradient(log.(μ)) do xs
sinkhorn_divergence(μ, softmax(xs), C, ε; regularization=reg)
end
@test norm(∇, Inf) ≈ 0 atol = 1e-9 # Sinkhorn divergence has minimum at SD(μ, μ)
end
@test norm(∇, Inf) ≈ 0 rtol = 1e-9 # Sinkhorn divergence has minimum at SD(μ, μ)
end
end
end
Expand All @@ -82,27 +98,32 @@ Random.seed!(100)
Cμν = pairwise(SqEuclidean(), μ_spt', ν_spt'; dims=2)
Cμ = pairwise(SqEuclidean(), μ_spt'; dims=2)
Cν = pairwise(SqEuclidean(), ν_spt'; dims=2)
ε = 0.05 * max(mean(Cμν), mean(Cμ), mean(Cν))
ε = 0.1 * max(mean(Cμν), mean(Cμ), mean(Cν))

@testset "basic" begin
@test sinkhorn_divergence(μ, ν, Cμν, Cμ, Cν, ε) ≥ 0
@test sinkhorn_divergence(μ, μ, Cμ, Cμ, Cμ, ε) ≈ 0
for reg in (true, false)
@test sinkhorn_divergence(μ, ν, Cμν, Cμ, Cν, ε; regularization=reg) ≥ 0
@test sinkhorn_divergence(μ, μ, Cμ, Cμ, Cμ, ε; regularization=reg) ≈ 0 rtol =
1e-6
end
end

@testset "AD" begin
for Diff in [ForwardDiff, ReverseDiff]
∇ = Diff.gradient(ν_spt) do xs
Cμν = pairwise(SqEuclidean(), μ_spt', xs'; dims=2)
Cν = pairwise(SqEuclidean(), xs'; dims=2)
sinkhorn_divergence(μ, ν, Cμν, Cμ, Cν, ε)
end
@test size(∇) == size(ν_spt)
∇ = Diff.gradient(μ_spt) do xs
Cμν = pairwise(SqEuclidean(), μ_spt', xs'; dims=2)
Cν = pairwise(SqEuclidean(), xs'; dims=2)
sinkhorn_divergence(μ, μ, Cμν, Cμ, Cν, ε)
for reg in (true, false)
∇ = Diff.gradient(ν_spt) do xs
Cμν = pairwise(SqEuclidean(), μ_spt', xs'; dims=2)
Cν = pairwise(SqEuclidean(), xs'; dims=2)
sinkhorn_divergence(μ, ν, Cμν, Cμ, Cν, ε)
end
@test size(∇) == size(ν_spt)
∇ = Diff.gradient(μ_spt) do xs
Cμν = pairwise(SqEuclidean(), μ_spt', xs'; dims=2)
Cν = pairwise(SqEuclidean(), xs'; dims=2)
sinkhorn_divergence(μ, μ, Cμν, Cμ, Cν, ε)
end
@test norm(∇, Inf) ≈ 0 rtol = 1e-6
end
@test norm(∇, Inf) ≈ 0
end
end
end
Expand Down