diff --git a/Project.toml b/Project.toml
index ffbd821..488f600 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,9 +4,7 @@ version = "0.2.3"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
-CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LuxurySparse = "d05aeea4-b7d4-55ac-b691-9e7fabb07ba2"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -18,9 +16,7 @@ Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c"
 
 [compat]
 BitBasis = "0.7"
-CUDAnative = "3.0"
-CuArrays = "2.2"
-GPUArrays = "3.0, 4.0"
+CUDA = "1.1, 1.2"
 LuxurySparse = "0.6"
 Reexport = "0.2"
 StaticArrays = "0.12"
@@ -32,6 +28,7 @@ julia = "1"
 [extras]
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
 
 [targets]
-test = ["Test", "Statistics"]
+test = ["Test", "Statistics", "YaoBlocks"]
diff --git a/benchmarks/TestCuda.jl b/benchmarks/TestCuda.jl
index f3b9f46..8c6a6cd 100644
--- a/benchmarks/TestCuda.jl
+++ b/benchmarks/TestCuda.jl
@@ -1,4 +1,4 @@
-using CuArrays, CUDAnative
+using CUDA
 using LinearAlgebra
 using BenchmarkTools
 
diff --git a/benchmarks/gates.jl b/benchmarks/gates.jl
index 0184ca7..7a3bcf6 100644
--- a/benchmarks/gates.jl
+++ b/benchmarks/gates.jl
@@ -1,18 +1,18 @@
-using Yao, CuYao, CuArrays
+using Yao, CuYao, CUDA
 using BenchmarkTools
 
 reg = rand_state(12; nbatch=1000)
 creg = reg |> cu
-@benchmark CuArrays.@sync creg |> put(12, 3=>Z)
-@benchmark CuArrays.@sync creg |> put(12, 3=>X)
+@benchmark CUDA.@sync creg |> put(12, 3=>Z)
+@benchmark CUDA.@sync creg |> put(12, 3=>X)
 @benchmark reg |> put(12, 3=>Z)
-@benchmark CuArrays.@sync creg |> control(12, 6, 3=>X)
+@benchmark CUDA.@sync creg |> control(12, 6, 3=>X)
 @benchmark reg |> control(12, 6, 3=>X)
-@benchmark CuArrays.@sync creg |> put(12, 3=>rot(X, 0.3))
+@benchmark CUDA.@sync creg |> put(12, 3=>rot(X, 0.3))
 @benchmark reg |> put(12, 3=>rot(X, 0.3))
 
 reg = rand_state(20)
 creg = reg |> cu
 g = swap(12, 7, 2)
 @benchmark reg |> g
-@benchmark CuArrays.@sync creg |> g
+@benchmark CUDA.@sync creg |> g
diff --git a/benchmarks/gcompile.jl b/benchmarks/gcompile.jl
index 7fcdb1e..62a90ae 100644
--- a/benchmarks/gcompile.jl
+++ b/benchmarks/gcompile.jl
@@ -1,5 +1,5 @@
 using Yao, Yao.Boost, Yao.Intrinsics, StaticArrays, Yao.Blocks
-using CuYao, CuArrays, CUDAnative, GPUArrays
+using CuYao, CUDA
 using BenchmarkTools, Profile
 
 nbit = 12
diff --git a/src/CUDApatch.jl b/src/CUDApatch.jl
index cfdf396..a5e6b2c 100644
--- a/src/CUDApatch.jl
+++ b/src/CUDApatch.jl
@@ -1,5 +1,5 @@
-#import CuArrays: _cuview, ViewIndex, NonContiguous
-#using GPUArrays: genperm
+#import CUDA: _cuview, ViewIndex, NonContiguous
+#using CUDA: genperm
 # fallback to SubArray when the view is not contiguous
 
 #=
@@ -14,11 +14,11 @@ function LinearAlgebra.permutedims!(dest::GPUArray, src::GPUArray, perm) where N
 end
 =#
 
-import CUDAnative: pow, abs, angle
+import CUDA: pow, abs, angle
 
 for (RT, CT) in [(:Float64, :ComplexF64), (:Float32, :ComplexF32)]
-    @eval cp2c(d::$RT, a::$RT) = CUDAnative.ComplexF64(d*CUDAnative.cos(a), d*CUDAnative.sin(a))
+    @eval cp2c(d::$RT, a::$RT) = CUDA.Complex(d*CUDA.cos(a), d*CUDA.sin(a))
     for NT in [RT, :Int32]
-        @eval CUDAnative.pow(z::$CT, n::$NT) = CUDAnative.ComplexF64((CUDAnative.pow(CUDAnative.abs(z), n)*CUDAnative.cos(n*CUDAnative.angle(z))), (CUDAnative.pow(CUDAnative.abs(z), n)*CUDAnative.sin(n*CUDAnative.angle(z))))
+        @eval CUDA.pow(z::$CT, n::$NT) = CUDA.Complex((CUDA.pow(CUDA.abs(z), n)*CUDA.cos(n*CUDA.angle(z))), (CUDA.pow(CUDA.abs(z), n)*CUDA.sin(n*CUDA.angle(z))))
     end
 end
@@ -47,7 +47,7 @@ bit_count(UInt32(0b11111))
 using LinearAlgebra
 import LinearAlgebra: norm
 const CuSubArr{T, N} = Union{CuArray{T, N}, SubArray{T, N, <:CuArray}}
-norm2(A::CuSubArr; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDAnative.sqrt
+norm2(A::CuSubArr; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDA.sqrt
 
 export piecewise, cudiv
 @inline function cudiv(x::Int)
@@ -74,7 +74,7 @@ piecewise(state::AbstractMatrix, inds) = @inbounds view(state,:,inds[2])
 
 import Base: kron, getindex
 function kron(A::Union{CuArray{T1}, Adjoint{<:Any, <:CuArray{T1}}}, B::Union{CuArray{T2}, Adjoint{<:Any, <:CuArray{T2}}}) where {T1, T2}
-    res = CuArrays.zeros(promote_type(T1,T2), (size(A).*size(B))...)
+    res = CUDA.zeros(promote_type(T1,T2), (size(A).*size(B))...)
     CI = Base.CartesianIndices(res)
     @inline function kernel(res, A, B)
         state = (blockIdx().x-1) * blockDim().x + threadIdx().x
@@ -114,7 +114,7 @@ function kron!(C::CuArray{T3}, A::Union{CuArray{T1}, Adjoint{<:Any, <:CuArray{T1
 end
 
 function getindex(A::CuVector{T}, B::CuArray{<:Integer}) where T
-    res = CuArrays.zeros(T, size(B)...)
+    res = CUDA.zeros(T, size(B)...)
     @inline function kernel(res, A, B)
         state = (blockIdx().x-1) * blockDim().x + threadIdx().x
         state <= length(res) && (@inbounds res[state] = A[B[state]])
@@ -131,4 +131,3 @@ function getindex(A::AbstractVector, B::CuArray{<:Integer})
 end
 
 YaoBlocks.AD.as_scalar(x::CuArray) = Array(x)[]
-
diff --git a/src/CuYao.jl b/src/CuYao.jl
index b27cb21..46bde19 100644
--- a/src/CuYao.jl
+++ b/src/CuYao.jl
@@ -7,7 +7,8 @@ import TupleTools
 using Random
 using Yao.YaoArrayRegister
 
-using GPUArrays, CuArrays, CUDAnative
+using CUDA
+import Yao: kron!
 @reexport using Yao
 
 const Ints = NTuple{<:Any, Int}
@@ -18,7 +19,7 @@ include("gpuapplys.jl")
 #include("gcompile.jl")
 
 function __init__()
-    CuArrays.allowscalar(false)
+    CUDA.allowscalar(false)
 end
 
 end
diff --git a/src/GPUReg.jl b/src/GPUReg.jl
index 0ac0ba7..ce29f62 100644
--- a/src/GPUReg.jl
+++ b/src/GPUReg.jl
@@ -1,4 +1,4 @@
-import CuArrays: cu
+import CUDA: cu
 import Yao.YaoArrayRegister: _measure, measure, measure!, measure_collapseto!, measure_remove!
 import Yao.YaoBase: batch_normalize!
 import Yao: expect
@@ -46,7 +46,7 @@ function measure!(::RemoveMeasured, ::ComputationalBasis, reg::GPUReg{B}, ::AllL
         if state <= length(nregm)
             @inbounds i,j = CI[state].I
             @inbounds r = Int(res[j])+1
-            @inbounds nregm[i,j] = regm[r,i,j]/CUDAnative.sqrt(pl[r, j])
+            @inbounds nregm[i,j] = regm[r,i,j]/CUDA.sqrt(pl[r, j])
         end
         return
     end
@@ -69,7 +69,7 @@ function measure!(::NoPostProcess, ::ComputationalBasis, reg::GPUReg{B, T}, ::Al
         if state <= length(regm)
             @inbounds k,i,j = CI[state].I
             @inbounds rind = Int(res[j]) + 1
-            @inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDAnative.sqrt(pl[k, j]) : T(0)
+            @inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDA.sqrt(pl[k, j]) : T(0)
         end
         return
     end
@@ -92,8 +92,8 @@ function measure!(rst::ResetTo, ::ComputationalBasis, reg::GPUReg{B, T}, ::AllLo
         if state <= length(regm)
             @inbounds k,i,j = CI[state].I
             @inbounds rind = Int(res[j]) + 1
-            @inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDAnative.sqrt(pl[rind, j]))
-            CuArrays.sync_threads()
+            @inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDA.sqrt(pl[rind, j]))
+            CUDA.sync_threads()
             @inbounds k!=val+1 && (regm[k,i,j] = 0)
         end
         return
     end
@@ -106,7 +106,7 @@ end
 import Yao.YaoArrayRegister: insert_qubits!, join
 
 function YaoBase.batched_kron(A::Union{CuArray{T1, 3}, Adjoint{<:Any, <:CuArray{T1, 3}}}, B::Union{CuArray{T2, 3}, Adjoint{<:Any, <:CuArray{T2, 3}}}) where {T1 ,T2}
-    res = CuArrays.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
+    res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
     CI = Base.CartesianIndices(res)
     @inline function kernel(res, A, B)
         state = (blockIdx().x-1) * blockDim().x + threadIdx().x
@@ -133,7 +133,7 @@ Performs batched Kronecker products in-place on the GPU.
 The results are stored in 'C', overwriting the existing values of 'C'.
 """
 function YaoBase.batched_kron!(C::CuArray{T3, 3}, A::Union{CuArray{T1, 3}, Adjoint{<:Any, <:CuArray{T1, 3}}}, B::Union{CuArray{T2, 3}, Adjoint{<:Any, <:CuArray{T2, 3}}}) where {T1 ,T2, T3}
-    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2)), size(A,3)) || throw(DimensionMismatch())
+    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch())
     @boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch())
     CI = Base.CartesianIndices(C)
     @inline function kernel(C, A, B)
diff --git a/src/gpuapplys.jl b/src/gpuapplys.jl
index e61812d..fd1a5a7 100644
--- a/src/gpuapplys.jl
+++ b/src/gpuapplys.jl
@@ -110,6 +110,6 @@ end
 using Yao.YaoBlocks
 function YaoBlocks._apply_fallback!(r::GPUReg{B,T}, b::AbstractBlock) where {B,T}
     YaoBlocks._check_size(r, b)
-    r.state .= CuArrays.adapt(CuArray{T}, mat(T, b)) * r.state
+    r.state .= CUDA.adapt(CuArray{T}, mat(T, b)) * r.state
     return r
 end
diff --git a/src/kernels.jl b/src/kernels.jl
index 81b11a3..1b567e2 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -22,17 +22,21 @@ end
         u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
     end
 end
-u1_kernel(nbit::Int, U1::SDSparseMatrixCSC, ibit::Int) = u1_kernel(nbit, U1|>Matrix, ibit)
+function u1_kernel(nbit::Int, U1::SDSparseMatrixCSC, ibit::Int)
+    u1_kernel(nbit, U1|>Matrix, ibit)
+end
 
 @inline function u1_kernel(nbit::Int, U1::SDPermMatrix, ibit::Int)
-    U1.perm[1] == 1 && return u1_kernel(nbit, Diagonal(U1.vals), ibit)
+    if U1.perm[1] == 1
+        return u1_kernel(nbit, Diagonal(U1.vals), ibit)
+    end
     mask = bmask(ibit)
     b, c = U1.vals[1], U1.vals[2]
     step = 1<<(ibit-1)
     configs = itercontrol(nbit, [ibit], [0])
 
-    length(configs), @inline function kernel(state, inds)
+    1<<(nbit-1), function kernel(state, inds)
        x = @inbounds configs[inds[1]] + 1
        swaprows!(piecewise(state, inds), x, x+step, c, b)
    end
@@ -107,7 +111,7 @@ end
     mask = bmask(Int32, bits...)
     1<<
[...]
diff --git a/test/gpuapplys.jl b/test/gpuapplys.jl
[...]
-    @test instruct!(v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(v1 |> copy, U1, (3,))
+for U1 in [mat(H), mat(Z), mat(I2), mat(P0), mat(X), mat(Y)]
+    #@test instruct!(v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(v1 |> copy, U1, (3,))
     @test instruct!(vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(vn |> copy, U1, (3,))
 end
 # sparse matrix like P0, P1 et. al. are not implemented.
diff --git a/test/runtests.jl b/test/runtests.jl
index 913160f..c8b9cd0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,5 @@
-using CuArrays
-CuArrays.allowscalar(false)
+using CUDA
+CUDA.allowscalar(false)
 include("CUDApatch.jl")
 include("GPUReg.jl")
 include("gpuapplys.jl")
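
For reference, here is the migrated surface in use. A minimal sketch, assuming a CUDA-capable GPU and the CUDA.jl 1.1/1.2 versions pinned in Project.toml above; the gate and measurement calls mirror the ones exercised in benchmarks/gates.jl:

using Yao, CuYao, CUDA

CUDA.allowscalar(false)                   # the same setting CuYao's __init__ now applies

reg  = rand_state(12; nbatch=10)          # CPU register
creg = reg |> cu                          # upload to the GPU; `cu` now comes from CUDA
creg |> put(12, 3=>X)                     # gates dispatch to the GPU kernels patched above
CUDA.@sync creg |> control(12, 6, 3=>X)   # CUDA.@sync replaces CuArrays.@sync
res = measure!(creg)                      # measurement kernels now call CUDA.sqrt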
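
The kron, getindex, and measure! kernels in this diff all share one launch pattern: compute a global thread index from blockIdx/blockDim/threadIdx, guard it against the output length, and size the grid with cudiv. A self-contained sketch of that pattern under CUDA.jl follows; the cudiv body and the scale! example are illustrative assumptions, not part of the diff:

using CUDA

# Split x work items into (threads per block, number of blocks); matches the
# cudiv signature exported from src/CUDApatch.jl (body assumed here).
@inline function cudiv(x::Int)
    max_threads = 256
    threads = min(max_threads, x)
    threads, ceil(Int, x / threads)
end

# Hypothetical element-wise kernel using the same indexing scheme as the
# kron/getindex kernels above.
function scale!(A::CuArray, s::Number)
    function kernel(A, s)
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x  # global thread index
        i <= length(A) && (@inbounds A[i] *= s)              # guard out-of-range threads
        return
    end
    X, Y = cudiv(length(A))
    @cuda threads=X blocks=Y kernel(A, s)
    return A
end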