diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 00000000..2b1c7e61
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,16 @@
+steps:
+  - label: "Julia v1"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1:
+          coverage: false
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+env:
+  GROUP: GPU
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index b1b3a6ba..43887b24 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -49,6 +49,7 @@ jobs:
           coverage: ${{ matrix.coverage || false }}
         env:
           PYTHON: '' # Use Conda.jl also on Linux
+          GROUP: OptimalTransport
       - uses: julia-actions/julia-processcoverage@v1
         if: matrix.coverage
       - uses: codecov/codecov-action@v1
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 8e8b864f..af221183 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -13,4 +13,4 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
-        run: julia -e 'using CompatHelper; CompatHelper.main(; subdirs = ["", "docs", "examples/basic"])'
+        run: julia -e 'using CompatHelper; CompatHelper.main(; subdirs = ["", "docs", "docs/gpu", "examples/basic"])'
diff --git a/.gitignore b/.gitignore
index ef626793..936163cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,8 @@ docs/src/examples/
 # committed for packages, but should be committed for applications that require a static
 # environment.
 /Manifest.toml
+/test/Manifest.toml
+/test/gpu/Manifest.toml
 
 # Files generated by Jupyter Notebooks
 *.ipynb_checkpoints
diff --git a/Project.toml b/Project.toml
index 6effd863..66c4cc8f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,10 +23,12 @@ QuadGK = "2"
 julia = "1"
 
 [extras]
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 PythonOT = "3c485715-4278-42b2-9b5f-8f00e43c12ef"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Tulip = "6dd1b50a-3aae-11e9-10b5-ef983d2400fa"
 
 [targets]
-test = ["Tulip", "PythonOT", "Random", "Test"]
+test = ["Pkg", "PythonOT", "Random", "SafeTestsets", "Test", "Tulip"]
diff --git a/test/entropic.jl b/test/entropic.jl
new file mode 100644
index 00000000..46b2cd3b
--- /dev/null
+++ b/test/entropic.jl
@@ -0,0 +1,146 @@
+using OptimalTransport
+
+using Distances
+using PythonOT: PythonOT
+
+using Random
+using Test
+
+const POT = PythonOT
+
+Random.seed!(100)
+
+@testset "entropic.jl" begin
+    @testset "sinkhorn" begin
+        M = 250
+        N = 200
+
+        @testset "example" begin
+            # create two uniform histograms
+            μ = fill(1 / M, M)
+            ν = fill(1 / N, N)
+
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport map (Julia implementation + POT)
+            eps = 0.01
+            γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
+            γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)
+            @test γ_pot ≈ γ rtol = 1e-6
+
+            # compute optimal transport cost
+            c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
+
+            # with regularization term
+            c_w_regularization = sinkhorn2(μ, ν, C, eps; maxiter=5_000, regularization=true)
+            @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
+
+            # compare with POT
+            c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)[1]
+            @test c_pot ≈ c
+
+            # ensure that provided map is used and correct
+            c2 = sinkhorn2(similar(μ), similar(ν), C, rand(); plan=γ)
+            @test c2 ≈ c
+            c2_w_regularization = sinkhorn2(
+                similar(μ), similar(ν), C, eps; plan=γ, regularization=true
+            )
+            @test c2_w_regularization ≈ c_w_regularization
+        end
+
+        # different element type
+        @testset "Float32" begin
+            # create two uniform histograms
+            μ = fill(Float32(1 / M), M)
+            ν = fill(Float32(1 / N), N)
+
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(Float32, 1, M), rand(Float32, 1, N); dims=2)
+
+            # compute optimal transport map (Julia implementation + POT)
+            eps = 0.01f0
+            γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
+            @test eltype(γ) === Float32
+
+            γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)
+            @test Float32.(γ_pot) ≈ γ rtol = 1e-3
+
+            # compute optimal transport cost
+            c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
+            @test c isa Float32
+
+            # with regularization term
+            c_w_regularization = sinkhorn2(
+                μ, ν, C, eps; maxiter=5_000, rtol=1e-6, regularization=true
+            )
+            @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
+
+            # compare with POT
+            c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)[1]
+            @test Float32(c_pot) ≈ c rtol = 1e-3
+        end
+
+        @testset "deprecations" begin
+            # create two uniform histograms
+            μ = fill(1 / M, M)
+            ν = fill(1 / N, N)
+
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # check `sinkhorn2`
+            eps = 0.01
+            c = sinkhorn2(μ, ν, C, eps; atol=1e-6)
+            @test (@test_deprecated sinkhorn2(μ, ν, C, eps; tol=1e-6)) == c
+            c = sinkhorn2(μ, ν, C, eps; check_convergence=5)
+            @test (@test_deprecated sinkhorn2(μ, ν, C, eps; check_marginal_step=5)) == c
+
+            # check `sinkhorn_gibbs
+            K = @. exp(-C / eps)
+            γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; atol=1e-6)
+            @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(μ, ν, K; tol=1e-6)) == γ
+            γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; check_convergence=5)
+            @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(
+                μ, ν, K; check_marginal_step=5
+            )) == γ
+        end
+    end
+
+    @testset "sinkhorn_stabilized" begin
+        M = 250
+        N = 200
+
+        # create two uniform histograms
+        μ = fill(1 / M, M)
+        ν = fill(1 / N, N)
+
+        # create random cost matrix
+        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+        # compute optimal transport map (Julia implementation + POT)
+        eps = 0.01
+        γ = sinkhorn_stabilized(μ, ν, C, eps)
+        γ_pot = POT.sinkhorn(μ, ν, C, eps; method="sinkhorn_stabilized")
+        @test γ ≈ γ_pot rtol = 1e-6
+    end
+
+    @testset "sinkhorn_barycenter" begin
+        # set up support
+        support = range(-1; stop=1, length=250)
+        μ1 = exp.(-(support .+ 0.5) .^ 2 ./ 0.1^2)
+        μ1 ./= sum(μ1)
+        μ2 = exp.(-(support .- 0.5) .^ 2 ./ 0.1^2)
+        μ2 ./= sum(μ2)
+        μ_all = hcat(μ1, μ2)'
+
+        # create cost matrix
+        C = pairwise(SqEuclidean(), support'; dims=2)
+
+        # compute Sinkhorn barycenter (Julia implementation + POT)
+        eps = 0.01
+        μ_interp = sinkhorn_barycenter(μ_all, [C, C], eps, [0.5, 0.5])
+        μ_interp_pot = POT.barycenter(μ_all', C, eps; weights=[0.5, 0.5], stopThr=1e-9)
+        @test μ_interp ≈ μ_interp_pot
+    end
+end
diff --git a/test/exact.jl b/test/exact.jl
new file mode 100644
index 00000000..df608641
--- /dev/null
+++ b/test/exact.jl
@@ -0,0 +1,111 @@
+using OptimalTransport
+
+using Distances
+using PythonOT: PythonOT
+using Tulip
+using MathOptInterface
+using Distributions
+
+using LinearAlgebra
+using Random
+
+const MOI = MathOptInterface
+const POT = PythonOT
+
+Random.seed!(100)
+
+@testset "exact.jl" begin
+    @testset "Earth-Movers Distance" begin
+        M = 200
+        N = 250
+        μ = rand(M)
+        ν = rand(N)
+        μ ./= sum(μ)
+        ν ./= sum(ν)
+
+        @testset "example" begin
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport map and cost with POT
+            pot_P = POT.emd(μ, ν, C)
+            pot_cost = POT.emd2(μ, ν, C)
+
+            # compute optimal transport map and cost with Tulip
+            lp = Tulip.Optimizer()
+            P = emd(μ, ν, C, lp)
+            @test size(C) == size(P)
+            @test MOI.get(lp, MOI.TerminationStatus()) == MOI.OPTIMAL
+            @test maximum(abs, P .- pot_P) < 1e-2
+
+            lp = Tulip.Optimizer()
+            cost = emd2(μ, ν, C, lp)
+            @test dot(C, P) ≈ cost atol = 1e-5
+            @test MOI.get(lp, MOI.TerminationStatus()) == MOI.OPTIMAL
+            @test cost ≈ pot_cost atol = 1e-5
+        end
+
+        @testset "pre-computed plan" begin
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport map
+            P = emd(μ, ν, C, Tulip.Optimizer())
+
+            # do not use μ and ν to ensure that provided map is used
+            cost = emd2(similar(μ), similar(ν), C, Tulip.Optimizer(); plan=P)
+            @test cost ≈ emd2(μ, ν, C, Tulip.Optimizer())
+        end
+
+        # https://github.com/JuliaOptimalTransport/OptimalTransport.jl/issues/71
+        @testset "cost matrix with integers" begin
+            C = pairwise(SqEuclidean(), rand(1:10, 1, M), rand(1:10, 1, N); dims=2)
+            emd2(μ, ν, C, Tulip.Optimizer())
+        end
+    end
+
+    @testset "1D Optimal Transport for Convex Cost" begin
+        @testset "continuous distributions" begin
+            # two normal distributions (has analytical solution)
+            μ = Normal(randn(), rand())
+            ν = Normal(randn(), rand())
+
+            # compute OT plan
+            γ = ot_plan(sqeuclidean, μ, ν)
+            x = randn()
+            @test γ(x) ≈ quantile(ν, cdf(μ, x))
+
+            # compute OT cost
+            c = ot_cost(sqeuclidean, μ, ν)
+            @test c ≈ (mean(μ) - mean(ν))^2 + (std(μ) - std(ν))^2
+
+            # do not use ν to ensure that the provided plan is used
+            @test ot_cost(sqeuclidean, μ, Normal(randn(), rand()); plan=γ) ≈ c
+        end
+
+        @testset "semidiscrete case" begin
+            μ = Normal(randn(), rand())
+            νprobs = rand(30)
+            νprobs ./= sum(νprobs)
+            ν = Categorical(νprobs)
+
+            # compute OT plan
+            γ = ot_plan(euclidean, μ, ν)
+            x = randn()
+            @test γ(x) ≈ quantile(ν, cdf(μ, x))
+
+            # compute OT cost, without and with provided plan
+            # do not use ν in the second case to ensure that the provided plan is used
+            c = ot_cost(euclidean, μ, ν)
+            @test ot_cost(euclidean, μ, Categorical(reverse(νprobs)); plan=γ) ≈ c
+
+            # check that OT cost is consistent with OT cost of a discretization
+            m = 500
+            xs = rand(μ, m)
+            μdiscrete = fill(1 / m, m)
+            C = pairwise(Euclidean(), xs', (1:length(νprobs))'; dims=2)
+            c2 = emd2(μdiscrete, νprobs, C, Tulip.Optimizer())
+            @test c2 ≈ c rtol = 1e-1
+        end
+    end
+end
diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml
new file mode 100644
index 00000000..2665a3b0
--- /dev/null
+++ b/test/gpu/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+OptimalTransport = "7e02d93a-ae51-4f58-b602-d97af76e3b33"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+CUDA = "3"
+Distances = "0.10"
+OptimalTransport = "0.3"
+julia = "1.6"
diff --git a/test/gpu/simple_gpu.jl b/test/gpu/simple_gpu.jl
new file mode 100644
index 00000000..c2284dff
--- /dev/null
+++ b/test/gpu/simple_gpu.jl
@@ -0,0 +1,68 @@
+using OptimalTransport
+
+using CUDA
+using Distances
+
+using Random
+using Test
+
+CUDA.allowscalar(false)
+
+Random.seed!(100)
+
+@testset "simple_gpu.jl" begin
+    # ensure that a GPU is available
+    if !CUDA.functional()
+        @warn "skipped GPU tests: no GPU available"
+    else
+        @testset "sinkhorn" begin
+            # source histogram
+            m = 200
+            μ = rand(Float32, m)
+            μ ./= sum(μ)
+
+            # target histogram
+            n = 250
+            ν = rand(Float32, n)
+            ν ./= sum(ν)
+
+            # random cost matrix
+            C = pairwise(SqEuclidean(), randn(Float32, 1, m), randn(Float32, 1, n); dims=2)
+
+            # compute transport plan and cost on the GPU
+            ε = 0.01f0
+            γ = sinkhorn(cu(μ), cu(ν), cu(C), ε)
+            c = sinkhorn2(cu(μ), cu(ν), cu(C), ε)
+
+            # compare with results on the CPU
+            @test γ ≈ cu(sinkhorn(μ, ν, C, ε))
+            @test c ≈ cu(sinkhorn2(μ, ν, C, ε))
+        end
+
+        @testset "sinkhorn_unbalanced" begin
+            # source histogram
+            m = 200
+            μ = rand(Float32, m)
+            μ ./= 1.5f0 * sum(μ)
+
+            # target histogram
+            n = 250
+            ν = rand(Float32, n)
+            ν ./= sum(ν)
+
+            # random cost matrix
+            C = pairwise(SqEuclidean(), randn(Float32, 1, m), randn(Float32, 1, n); dims=2)
+
+            # compute transport plan and cost on the GPU
+            ε = 0.01f0
+            λ1 = 0.4f0
+            λ2 = 0.6f0
+            γ = sinkhorn_unbalanced(cu(μ), cu(ν), cu(C), λ1, λ2, ε)
+            c = sinkhorn_unbalanced2(cu(μ), cu(ν), cu(C), λ1, λ2, ε)
+
+            # compare with results on the CPU
+            @test γ ≈ cu(sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε))
+            @test c ≈ cu(sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε))
+        end
+    end
+end
diff --git a/test/quadratic.jl b/test/quadratic.jl
new file mode 100644
index 00000000..94ae98b3
--- /dev/null
+++ b/test/quadratic.jl
@@ -0,0 +1,33 @@
+using OptimalTransport
+
+using Distances
+using PythonOT: PythonOT
+
+using Random
+using Test
+
+const POT = PythonOT
+
+Random.seed!(100)
+
+@testset "quadratic.jl" begin
+    @testset "quadreg" begin
+        M = 250
+        N = 200
+
+        # create two uniform histograms
+        μ = fill(1 / M, M)
+        ν = fill(1 / N, N)
+
+        # create random cost matrix
+        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+        # compute optimal transport map (Julia implementation + POT)
+        eps = 0.25
+        γ = quadreg(μ, ν, C, eps)
+        γ_pot = POT.Smooth.smooth_ot_dual(μ, ν, C, eps; stopThr=1e-9)
+
+        # need to use a larger tolerance here because of a quirk with the POT solver
+        @test γ ≈ γ_pot rtol = 1e-1
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index c795a416..ca37436a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,386 +1,38 @@
 using OptimalTransport
+using Pkg: Pkg
+using SafeTestsets
 
-using Distances
-using PythonOT: PythonOT
-using Tulip
-using MathOptInterface
-using Distributions
-using SparseArrays
-
-using LinearAlgebra
-using Random
 using Test
 
-const MOI = MathOptInterface
-const POT = PythonOT
-
-Random.seed!(100)
-
-@testset "Earth-Movers Distance" begin
-    M = 200
-    N = 250
-    μ = rand(M)
-    ν = rand(N)
-    μ ./= sum(μ)
-    ν ./= sum(ν)
-
-    @testset "example" begin
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport map and cost with POT
-        pot_P = POT.emd(μ, ν, C)
-        pot_cost = POT.emd2(μ, ν, C)
-
-        # compute optimal transport map and cost with Tulip
-        lp = Tulip.Optimizer()
-        P = emd(μ, ν, C, lp)
-        @test size(C) == size(P)
-        @test MOI.get(lp, MOI.TerminationStatus()) == MOI.OPTIMAL
-        @test maximum(abs, P .- pot_P) < 1e-2
-
-        lp = Tulip.Optimizer()
-        cost = emd2(μ, ν, C, lp)
-        @test dot(C, P) ≈ cost atol = 1e-5
-        @test MOI.get(lp, MOI.TerminationStatus()) == MOI.OPTIMAL
-        @test cost ≈ pot_cost atol = 1e-5
-    end
-
-    @testset "pre-computed plan" begin
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport map
-        P = emd(μ, ν, C, Tulip.Optimizer())
-
-        # do not use μ and ν to ensure that provided map is used
-        cost = emd2(similar(μ), similar(ν), C, Tulip.Optimizer(); plan=P)
-        @test cost ≈ emd2(μ, ν, C, Tulip.Optimizer())
-    end
-
-    # https://github.com/JuliaOptimalTransport/OptimalTransport.jl/issues/71
-    @testset "cost matrix with integers" begin
-        C = pairwise(SqEuclidean(), rand(1:10, 1, M), rand(1:10, 1, N); dims=2)
-        emd2(μ, ν, C, Tulip.Optimizer())
-    end
-end
-
-@testset "1D Optimal Transport for Convex Cost" begin
-    @testset "continuous distributions" begin
-        # two normal distributions (has analytical solution)
-        μ = Normal(randn(), rand())
-        ν = Normal(randn(), rand())
-
-        # compute OT plan
-        γ = ot_plan(sqeuclidean, μ, ν)
-        x = randn()
-        @test γ(x) ≈ quantile(ν, cdf(μ, x))
-
-        # compute OT cost
-        c = ot_cost(sqeuclidean, μ, ν)
-        @test c ≈ (mean(μ) - mean(ν))^2 + (std(μ) - std(ν))^2
-
-        # do not use ν to ensure that the provided plan is used
-        @test ot_cost(sqeuclidean, μ, Normal(randn(), rand()); plan=γ) ≈ c
-    end
-
-    @testset "semidiscrete case" begin
-        μ = Normal(randn(), rand())
-        νprobs = rand(30)
-        νprobs ./= sum(νprobs)
-        ν = Categorical(νprobs)
-
-        # compute OT plan
-        γ = ot_plan(euclidean, μ, ν)
-        x = randn()
-        @test γ(x) ≈ quantile(ν, cdf(μ, x))
-
-        # compute OT cost, without and with provided plan
-        # do not use ν in the second case to ensure that the provided plan is used
-        c = ot_cost(euclidean, μ, ν)
-        @test ot_cost(euclidean, μ, Categorical(reverse(νprobs)); plan=γ) ≈ c
-
-        # check that OT cost is consistent with OT cost of a discretization
-        m = 500
-        xs = rand(μ, m)
-        μdiscrete = fill(1 / m, m)
-        C = pairwise(Euclidean(), xs', (1:length(νprobs))'; dims=2)
-        c2 = emd2(μdiscrete, νprobs, C, Tulip.Optimizer())
-        @test c2 ≈ c rtol = 1e-1
-    end
-end
-
-@testset "entropically regularized transport" begin
-    M = 250
-    N = 200
-
-    @testset "example" begin
-        # create two uniform histograms
-        μ = fill(1 / M, M)
-        ν = fill(1 / N, N)
-
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport map (Julia implementation + POT)
-        eps = 0.01
-        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
-        γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)
-        @test γ_pot ≈ γ rtol = 1e-6
-
-        # compute optimal transport cost
-        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-9)
-
-        # with regularization term
-        c_w_regularization = sinkhorn2(μ, ν, C, eps; maxiter=5_000, regularization=true)
-        @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
-
-        # compare with POT
-        c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-9)[1]
-        @test c_pot ≈ c
-
-        # ensure that provided map is used and correct
-        c2 = sinkhorn2(similar(μ), similar(ν), C, rand(); plan=γ)
-        @test c2 ≈ c
-        c2_w_regularization = sinkhorn2(
-            similar(μ), similar(ν), C, eps; plan=γ, regularization=true
-        )
-        @test c2_w_regularization ≈ c_w_regularization
-    end
-
-    # different element type
-    @testset "Float32" begin
-        # create two uniform histograms
-        μ = fill(Float32(1 / M), M)
-        ν = fill(Float32(1 / N), N)
-
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(Float32, 1, M), rand(Float32, 1, N); dims=2)
-
-        # compute optimal transport map (Julia implementation + POT)
-        eps = 0.01f0
-        γ = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
-        @test eltype(γ) === Float32
-
-        γ_pot = POT.sinkhorn(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)
-        @test Float32.(γ_pot) ≈ γ rtol = 1e-3
-
-        # compute optimal transport cost
-        c = sinkhorn2(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
-        @test c isa Float32
-
-        # with regularization term
-        c_w_regularization = sinkhorn2(
-            μ, ν, C, eps; maxiter=5_000, rtol=1e-6, regularization=true
-        )
-        @test c_w_regularization ≈ c + eps * sum(x -> iszero(x) ? x : x * log(x), γ)
-
-        # compare with POT
-        c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)[1]
-        @test Float32(c_pot) ≈ c rtol = 1e-3
-    end
-
-    @testset "deprecations" begin
-        # create two uniform histograms
-        μ = fill(1 / M, M)
-        ν = fill(1 / N, N)
-
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # check `sinkhorn2`
-        eps = 0.01
-        c = sinkhorn2(μ, ν, C, eps; atol=1e-6)
-        @test (@test_deprecated sinkhorn2(μ, ν, C, eps; tol=1e-6)) == c
-        c = sinkhorn2(μ, ν, C, eps; check_convergence=5)
-        @test (@test_deprecated sinkhorn2(μ, ν, C, eps; check_marginal_step=5)) == c
-
-        # check `sinkhorn_gibbs
-        K = @. exp(-C / eps)
-        γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; atol=1e-6)
-        @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(μ, ν, K; tol=1e-6)) == γ
-        γ = OptimalTransport.sinkhorn_gibbs(μ, ν, K; check_convergence=5)
-        @test (@test_deprecated OptimalTransport.sinkhorn_gibbs(
-            μ, ν, K; check_marginal_step=5
-        )) == γ
-    end
-end
-
-@testset "unbalanced transport" begin
-    M = 250
-    N = 200
-
-    @testset "example" begin
-        μ = fill(1 / N, M)
-        ν = fill(1 / N, N)
-
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport plan
-        eps = 0.01
-        lambda = 1
-        γ = sinkhorn_unbalanced(μ, ν, C, lambda, lambda, eps)
-
-        # compare with POT
-        γ_pot = POT.sinkhorn_unbalanced(μ, ν, C, eps, lambda; stopThr=1e-9)
-        @test γ_pot ≈ γ
-
-        # compute optimal transport cost
-        c = sinkhorn_unbalanced2(μ, ν, C, lambda, lambda, eps; maxiter=5_000)
-
-        # compare with POT
-        c_pot = POT.sinkhorn_unbalanced2(
-            μ, ν, C, eps, lambda; numItermax=5_000, stopThr=1e-9
-        )[1]
-        @test c_pot ≈ c
-
-        # ensure that provided plan is used
-        c2 = sinkhorn_unbalanced2(similar(μ), similar(ν), C, rand(), rand(), rand(); plan=γ)
-        @test c2 ≈ c
-    end
-
-    @testset "proxdiv operators" begin
-        μ = fill(1 / N, M)
-        ν = fill(1 / N, N)
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport plan and cost with real-valued parameters
-        eps = 0.01
-        lambda1 = 0.4
-        lambda2 = 0.5
-        γ = sinkhorn_unbalanced(μ, ν, C, lambda1, lambda2, eps)
-        c = sinkhorn_unbalanced2(μ, ν, C, lambda1, lambda2, eps)
-        @test c ≈ dot(γ, C)
-
-        # compute optimal transport plan and cost with manual "proxdiv" functions
-        proxdivF1!(s, p, ε) = (s .= s .^ (ε / (ε + 0.4)) .* p .^ (0.4 / (ε + 0.4)) ./ s)
-        proxdivF2!(s, p, ε) = (s .= s .^ (ε / (ε + 0.5)) .* p .^ (0.5 / (ε + 0.5)) ./ s)
-        γ_proxdiv = sinkhorn_unbalanced(μ, ν, C, proxdivF1!, proxdivF2!, eps)
-        c_proxdiv = sinkhorn_unbalanced2(μ, ν, C, proxdivF1!, proxdivF2!, eps)
-        @test γ_proxdiv ≈ γ
-        @test c_proxdiv ≈ c
-    end
-
-    @testset "consistency with balanced OT" begin
-        μ = fill(1 / M, M)
-        ν = fill(1 / N, N)
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport plan and cost with manual "proxdiv" functions for
-        # balanced OT
-        ε = 0.01
-        proxdivF!(s, p, ε) = (s .= p ./ s)
-        γ = sinkhorn_unbalanced(μ, ν, C, proxdivF!, proxdivF!, ε)
-        c = sinkhorn_unbalanced2(μ, ν, C, proxdivF!, proxdivF!, ε)
-        @test c ≈ dot(γ, C)
-
-        # compute optimal transport plan and cost for balanced OT
-        γ_balanced = sinkhorn(μ, ν, C, ε)
-        c_balanced = sinkhorn2(μ, ν, C, ε)
-        @test γ_balanced ≈ γ rtol = 1e-4
-        @test c_balanced ≈ c rtol = 1e-4
-    end
-
-    @testset "deprecations" begin
-        μ = fill(1 / N, M)
-        ν = fill(1 / N, N)
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport plan and cost with real-valued parameters
-        ε = 0.01
-        λ1 = 0.4
-        λ2 = 0.5
-        γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε)
-        c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε)
-
-        # compute optimal transport plan and cost with manual "proxdiv" functions
-        # as keyword arguments
-        proxdivF1(s, p) = s .^ (0.01 / (0.01 + 0.4)) .* p .^ (0.4 / (0.01 + 0.4)) ./ s
-        proxdivF2(s, p) = s .^ (0.01 / (0.01 + 0.5)) .* p .^ (0.5 / (0.01 + 0.5)) ./ s
-        γ_proxdiv = @test_deprecated sinkhorn_unbalanced(
-            μ, ν, C, rand(), rand(), ε; proxdiv_F1=proxdivF1, proxdiv_F2=proxdivF2
-        )
-        c_proxdiv = @test_deprecated sinkhorn_unbalanced2(
-            μ, ν, C, rand(), rand(), ε; proxdiv_F1=proxdivF1, proxdiv_F2=proxdivF2
-        )
-        @test γ_proxdiv ≈ γ
-        @test c_proxdiv ≈ c
-
-        # deprecated `tol` keyword argument
-        γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; atol=1e-7)
-        c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; atol=1e-7)
-        γ_tol = @test_deprecated sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; tol=1e-7)
-        c_tol = @test_deprecated sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; tol=1e-7)
-        @test γ_tol == γ
-        @test c_tol == c
-
-        # deprecated `max_iter` keyword argument
-        γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; maxiter=50)
-        c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; maxiter=50)
-        γ_max_iter = @test_deprecated sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; max_iter=50)
-        c_max_iter = @test_deprecated sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; max_iter=50)
-        @test γ_max_iter == γ
-        @test c_max_iter == c
-    end
-end
-
-@testset "stabilized sinkhorn" begin
-    M = 250
-    N = 200
-
-    @testset "example" begin
-        # create two uniform histograms
-        μ = fill(1 / M, M)
-        ν = fill(1 / N, N)
-
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport map (Julia implementation + POT)
-        eps = 0.01
-        γ = sinkhorn_stabilized(μ, ν, C, eps)
-        γ_pot = POT.sinkhorn(μ, ν, C, eps; method="sinkhorn_stabilized")
-        @test norm(γ - γ_pot, Inf) < 1e-9
-    end
-end
-
-@testset "quadratic optimal transport" begin
-    M = 250
-    N = 200
-    @testset "example" begin
-        # create two uniform histograms
-        μ = fill(1 / M, M)
-        ν = fill(1 / N, N)
-
-        # create random cost matrix
-        C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
-
-        # compute optimal transport map (Julia implementation + POT)
-        eps = 0.25
-        γ = quadreg(μ, ν, C, eps)
-        γ_pot = POT.Smooth.smooth_ot_dual(μ, ν, C, eps; stopThr=1e-9)
-        # need to use a larger tolerance here because of a quirk with the POT solver 
-        @test norm(γ - γ_pot, Inf) < 1e-4
-    end
-end
-
-@testset "sinkhorn barycenter" begin
-    @testset "example" begin
-        # set up support
-        support = range(-1; stop=1, length=250)
-        μ1 = exp.(-(support .+ 0.5) .^ 2 ./ 0.1^2)
-        μ1 ./= sum(μ1)
-        μ2 = exp.(-(support .- 0.5) .^ 2 ./ 0.1^2)
-        μ2 ./= sum(μ2)
-        μ_all = hcat(μ1, μ2)'
-
-        # create cost matrix
-        C = pairwise(SqEuclidean(), support'; dims=2)
-
-        # compute Sinkhorn barycenter (Julia implementation + POT)
-        eps = 0.01
-        μ_interp = sinkhorn_barycenter(μ_all, [C, C], eps, [0.5, 0.5])
-        μ_interp_pot = POT.barycenter(μ_all', C, eps; weights=[0.5, 0.5], stopThr=1e-9)
-        @test norm(μ_interp - μ_interp_pot, Inf) < 1e-9
+const GROUP = get(ENV, "GROUP", "All")
+
+@testset "OptimalTransport" begin
+    if GROUP == "All" || GROUP == "OptimalTransport"
+        @safetestset "Exact OT" begin
+            include("exact.jl")
+        end
+        @safetestset "Entropically regularized OT" begin
+            include("entropic.jl")
+        end
+        @safetestset "Quadratically regularized OT" begin
+            include("quadratic.jl")
+        end
+        @safetestset "Unbalanced OT" begin
+            include("unbalanced.jl")
+        end
+    end
+
+    # CUDA requires Julia >= 1.6
+    if (GROUP == "All" || GROUP == "GPU") && VERSION >= v"1.6"
+        # activate separate environment: CUDA can't be added to test/Project.toml since it
+        # is not available on older Julia versions
+        pkgdir = dirname(dirname(pathof(OptimalTransport)))
+        Pkg.activate("gpu")
+        Pkg.develop(Pkg.PackageSpec(; path=pkgdir))
+        Pkg.instantiate()
+
+        @safetestset "Simple GPU" begin
+            include(joinpath("gpu/simple_gpu.jl"))
+        end
     end
 end
diff --git a/test/unbalanced.jl b/test/unbalanced.jl
new file mode 100644
index 00000000..34a13b2e
--- /dev/null
+++ b/test/unbalanced.jl
@@ -0,0 +1,145 @@
+using OptimalTransport
+using Distances
+using PythonOT: PythonOT
+
+using LinearAlgebra
+using Random
+using Test
+
+const POT = PythonOT
+
+Random.seed!(100)
+
+@testset "unbalanced.jl" begin
+    @testset "sinkhorn_unbalanced" begin
+        M = 250
+        N = 200
+
+        @testset "example" begin
+            μ = fill(1 / N, M)
+            ν = fill(1 / N, N)
+
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport plan
+            eps = 0.01
+            lambda = 1
+            γ = sinkhorn_unbalanced(μ, ν, C, lambda, lambda, eps)
+
+            # compare with POT
+            γ_pot = POT.sinkhorn_unbalanced(μ, ν, C, eps, lambda; stopThr=1e-9)
+            @test γ_pot ≈ γ
+
+            # compute optimal transport cost
+            c = sinkhorn_unbalanced2(μ, ν, C, lambda, lambda, eps; maxiter=5_000)
+
+            # compare with POT
+            c_pot = POT.sinkhorn_unbalanced2(
+                μ, ν, C, eps, lambda; numItermax=5_000, stopThr=1e-9
+            )[1]
+            @test c_pot ≈ c
+
+            # ensure that provided plan is used
+            c2 = sinkhorn_unbalanced2(
+                similar(μ), similar(ν), C, rand(), rand(), rand(); plan=γ
+            )
+            @test c2 ≈ c
+        end
+
+        @testset "proxdiv operators" begin
+            μ = fill(1 / N, M)
+            ν = fill(1 / N, N)
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport plan and cost with real-valued parameters
+            eps = 0.01
+            lambda1 = 0.4
+            lambda2 = 0.5
+            γ = sinkhorn_unbalanced(μ, ν, C, lambda1, lambda2, eps)
+            c = sinkhorn_unbalanced2(μ, ν, C, lambda1, lambda2, eps)
+            @test c ≈ dot(γ, C)
+
+            # compute optimal transport plan and cost with manual "proxdiv" functions
+            function proxdivF1!(s, p, ε)
+                return (s .= s .^ (ε / (ε + 0.4)) .* p .^ (0.4 / (ε + 0.4)) ./ s)
+            end
+            function proxdivF2!(s, p, ε)
+                return (s .= s .^ (ε / (ε + 0.5)) .* p .^ (0.5 / (ε + 0.5)) ./ s)
+            end
+            γ_proxdiv = sinkhorn_unbalanced(μ, ν, C, proxdivF1!, proxdivF2!, eps)
+            c_proxdiv = sinkhorn_unbalanced2(μ, ν, C, proxdivF1!, proxdivF2!, eps)
+            @test γ_proxdiv ≈ γ
+            @test c_proxdiv ≈ c
+        end
+
+        @testset "consistency with balanced OT" begin
+            μ = fill(1 / M, M)
+            ν = fill(1 / N, N)
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport plan and cost with manual "proxdiv" functions for
+            # balanced OT
+            ε = 0.01
+            proxdivF!(s, p, ε) = (s .= p ./ s)
+            γ = sinkhorn_unbalanced(μ, ν, C, proxdivF!, proxdivF!, ε)
+            c = sinkhorn_unbalanced2(μ, ν, C, proxdivF!, proxdivF!, ε)
+            @test c ≈ dot(γ, C)
+
+            # compute optimal transport plan and cost for balanced OT
+            γ_balanced = sinkhorn(μ, ν, C, ε)
+            c_balanced = sinkhorn2(μ, ν, C, ε)
+            @test γ_balanced ≈ γ rtol = 1e-4
+            @test c_balanced ≈ c rtol = 1e-4
+        end
+
+        @testset "deprecations" begin
+            μ = fill(1 / N, M)
+            ν = fill(1 / N, N)
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport plan and cost with real-valued parameters
+            ε = 0.01
+            λ1 = 0.4
+            λ2 = 0.5
+            γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε)
+            c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε)
+
+            # compute optimal transport plan and cost with manual "proxdiv" functions
+            # as keyword arguments
+            function proxdivF1(s, p)
+                return s .^ (0.01 / (0.01 + 0.4)) .* p .^ (0.4 / (0.01 + 0.4)) ./ s
+            end
+            function proxdivF2(s, p)
+                return s .^ (0.01 / (0.01 + 0.5)) .* p .^ (0.5 / (0.01 + 0.5)) ./ s
+            end
+            γ_proxdiv = @test_deprecated sinkhorn_unbalanced(
+                μ, ν, C, rand(), rand(), ε; proxdiv_F1=proxdivF1, proxdiv_F2=proxdivF2
+            )
+            c_proxdiv = @test_deprecated sinkhorn_unbalanced2(
+                μ, ν, C, rand(), rand(), ε; proxdiv_F1=proxdivF1, proxdiv_F2=proxdivF2
+            )
+            @test γ_proxdiv ≈ γ
+            @test c_proxdiv ≈ c
+
+            # deprecated `tol` keyword argument
+            γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; atol=1e-7)
+            c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; atol=1e-7)
+            γ_tol = @test_deprecated sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; tol=1e-7)
+            c_tol = @test_deprecated sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; tol=1e-7)
+            @test γ_tol == γ
+            @test c_tol == c
+
+            # deprecated `max_iter` keyword argument
+            γ = sinkhorn_unbalanced(μ, ν, C, λ1, λ2, ε; maxiter=50)
+            c = sinkhorn_unbalanced2(μ, ν, C, λ1, λ2, ε; maxiter=50)
+            γ_max_iter = @test_deprecated sinkhorn_unbalanced(
+                μ, ν, C, λ1, λ2, ε; max_iter=50
+            )
+            c_max_iter = @test_deprecated sinkhorn_unbalanced2(
+                μ, ν, C, λ1, λ2, ε; max_iter=50
+            )
+            @test γ_max_iter == γ
+            @test c_max_iter == c
+        end
+    end
+end