Skip to content

Commit

Permalink
Merge pull request #62 from QuantumBFS/compat-CUDA
Browse files Browse the repository at this point in the history
compatibility upgrade - CUDA
  • Loading branch information
GiggleLiu authored Jul 19, 2020
2 parents 33675cc + 1a95354 commit e250477
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 50 deletions.
11 changes: 4 additions & 7 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ version = "0.2.3"

[deps]
BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxurySparse = "d05aeea4-b7d4-55ac-b691-9e7fabb07ba2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -18,9 +16,7 @@ Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c"

[compat]
BitBasis = "0.7"
CUDAnative = "3.0"
CuArrays = "2.2"
GPUArrays = "3.0, 4.0"
CUDA = "1.1, 1.2"
LuxurySparse = "0.6"
Reexport = "0.2"
StaticArrays = "0.12"
Expand All @@ -32,6 +28,7 @@ julia = "1"
[extras]
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"

[targets]
test = ["Test", "Statistics"]
test = ["Test", "Statistics", "YaoBlocks"]
2 changes: 1 addition & 1 deletion benchmarks/TestCuda.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using CuArrays, CUDAnative
using CUDA
using LinearAlgebra
using BenchmarkTools

Expand Down
12 changes: 6 additions & 6 deletions benchmarks/gates.jl
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
using Yao, CuYao, CuArrays
using Yao, CuYao, CUDA
using BenchmarkTools

reg = rand_state(12; nbatch=1000)
creg = reg |> cu
@benchmark CuArrays.@sync creg |> put(12, 3=>Z)
@benchmark CuArrays.@sync creg |> put(12, 3=>X)
@benchmark CUDA.@sync creg |> put(12, 3=>Z)
@benchmark CUDA.@sync creg |> put(12, 3=>X)
@benchmark reg |> put(12, 3=>Z)
@benchmark CuArrays.@sync creg |> control(12, 6, 3=>X)
@benchmark CUDA.@sync creg |> control(12, 6, 3=>X)
@benchmark reg |> control(12, 6, 3=>X)
@benchmark CuArrays.@sync creg |> put(12, 3=>rot(X, 0.3))
@benchmark CUDA.@sync creg |> put(12, 3=>rot(X, 0.3))
@benchmark reg |> put(12, 3=>rot(X, 0.3))

reg = rand_state(20)
creg = reg |> cu
g = swap(12, 7, 2)
@benchmark reg |> g
@benchmark CuArrays.@sync creg |> g
@benchmark CUDA.@sync creg |> g
2 changes: 1 addition & 1 deletion benchmarks/gcompile.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using Yao, Yao.Boost, Yao.Intrinsics, StaticArrays, Yao.Blocks
using CuYao, CuArrays, CUDAnative, GPUArrays
using CuYao, CUDA
using BenchmarkTools, Profile

nbit = 12
Expand Down
17 changes: 8 additions & 9 deletions src/CUDApatch.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#import CuArrays: _cuview, ViewIndex, NonContiguous
#using GPUArrays: genperm
#import CUDA: _cuview, ViewIndex, NonContiguous
#using CUDA: genperm
# fallback to SubArray when the view is not contiguous

#=
Expand All @@ -14,11 +14,11 @@ function LinearAlgebra.permutedims!(dest::GPUArray, src::GPUArray, perm) where N
end
=#

import CUDAnative: pow, abs, angle
import CUDA: pow, abs, angle
for (RT, CT) in [(:Float64, :ComplexF64), (:Float32, :ComplexF32)]
@eval cp2c(d::$RT, a::$RT) = CUDAnative.ComplexF64(d*CUDAnative.cos(a), d*CUDAnative.sin(a))
@eval cp2c(d::$RT, a::$RT) = CUDA.Complex(d*CUDA.cos(a), d*CUDA.sin(a))
for NT in [RT, :Int32]
@eval CUDAnative.pow(z::$CT, n::$NT) = CUDAnative.ComplexF64((CUDAnative.pow(CUDAnative.abs(z), n)*CUDAnative.cos(n*CUDAnative.angle(z))), (CUDAnative.pow(CUDAnative.abs(z), n)*CUDAnative.sin(n*CUDAnative.angle(z))))
@eval CUDA.pow(z::$CT, n::$NT) = CUDA.Complex((CUDA.pow(CUDA.abs(z), n)*CUDA.cos(n*CUDA.angle(z))), (CUDA.pow(CUDA.abs(z), n)*CUDA.sin(n*CUDA.angle(z))))
end
end

Expand Down Expand Up @@ -47,7 +47,7 @@ bit_count(UInt32(0b11111))
using LinearAlgebra
import LinearAlgebra: norm
const CuSubArr{T, N} = Union{CuArray{T, N}, SubArray{T, N, <:CuArray}}
norm2(A::CuSubArr; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDAnative.sqrt
"""
    norm2(A::CuSubArr; dims=1)

Sum `abs2` over `A` along `dims`, then take the elementwise square root of the
reduced array with `CUDA.sqrt`.
"""
function norm2(A::CuSubArr; dims=1)
    sqsum = mapreduce(abs2, +, A, dims=dims)
    return sqsum .|> CUDA.sqrt
end

export piecewise, cudiv
@inline function cudiv(x::Int)
Expand All @@ -74,7 +74,7 @@ piecewise(state::AbstractMatrix, inds) = @inbounds view(state,:,inds[2])

import Base: kron, getindex
function kron(A::Union{CuArray{T1}, Adjoint{<:Any, <:CuArray{T1}}}, B::Union{CuArray{T2}, Adjoint{<:Any, <:CuArray{T2}}}) where {T1, T2}
res = CuArrays.zeros(promote_type(T1,T2), (size(A).*size(B))...)
res = CUDA.zeros(promote_type(T1,T2), (size(A).*size(B))...)
CI = Base.CartesianIndices(res)
@inline function kernel(res, A, B)
state = (blockIdx().x-1) * blockDim().x + threadIdx().x
Expand Down Expand Up @@ -114,7 +114,7 @@ function kron!(C::CuArray{T3}, A::Union{CuArray{T1}, Adjoint{<:Any, <:CuArray{T1
end

function getindex(A::CuVector{T}, B::CuArray{<:Integer}) where T
res = CuArrays.zeros(T, size(B)...)
res = CUDA.zeros(T, size(B)...)
@inline function kernel(res, A, B)
state = (blockIdx().x-1) * blockDim().x + threadIdx().x
state <= length(res) && (@inbounds res[state] = A[B[state]])
Expand All @@ -131,4 +131,3 @@ function getindex(A::AbstractVector, B::CuArray{<:Integer})
end

# Convert the `CuArray` to a plain `Array` first, then read its single element with `[]`.
function YaoBlocks.AD.as_scalar(x::CuArray)
    host = Array(x)
    return host[]
end

5 changes: 3 additions & 2 deletions src/CuYao.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import TupleTools
using Random

using Yao.YaoArrayRegister
using GPUArrays, CuArrays, CUDAnative
using CUDA
import Yao: kron!
@reexport using Yao

const Ints = NTuple{<:Any, Int}
Expand All @@ -18,7 +19,7 @@ include("gpuapplys.jl")
#include("gcompile.jl")

# Module initializer (Julia calls `__init__` when the module is loaded).
# Both calls pass `false` to `allowscalar`; they differ only in the namespace
# (`CuArrays` vs `CUDA`) through which the function is reached.
# NOTE(review): presumably this disables scalar indexing of GPU arrays — confirm
# against the CUDA.jl documentation.
function __init__()
CuArrays.allowscalar(false)
CUDA.allowscalar(false)
end

end
14 changes: 7 additions & 7 deletions src/GPUReg.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import CuArrays: cu
import CUDA: cu
import Yao.YaoArrayRegister: _measure, measure, measure!, measure_collapseto!, measure_remove!
import Yao.YaoBase: batch_normalize!
import Yao: expect
Expand Down Expand Up @@ -46,7 +46,7 @@ function measure!(::RemoveMeasured, ::ComputationalBasis, reg::GPUReg{B}, ::AllL
if state <= length(nregm)
@inbounds i,j = CI[state].I
@inbounds r = Int(res[j])+1
@inbounds nregm[i,j] = regm[r,i,j]/CUDAnative.sqrt(pl[r, j])
@inbounds nregm[i,j] = regm[r,i,j]/CUDA.sqrt(pl[r, j])
end
return
end
Expand All @@ -69,7 +69,7 @@ function measure!(::NoPostProcess, ::ComputationalBasis, reg::GPUReg{B, T}, ::Al
if state <= length(regm)
@inbounds k,i,j = CI[state].I
@inbounds rind = Int(res[j]) + 1
@inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDAnative.sqrt(pl[k, j]) : T(0)
@inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDA.sqrt(pl[k, j]) : T(0)
end
return
end
Expand All @@ -92,8 +92,8 @@ function measure!(rst::ResetTo, ::ComputationalBasis, reg::GPUReg{B, T}, ::AllLo
if state <= length(regm)
@inbounds k,i,j = CI[state].I
@inbounds rind = Int(res[j]) + 1
@inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDAnative.sqrt(pl[rind, j]))
CuArrays.sync_threads()
@inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDA.sqrt(pl[rind, j]))
CUDA.sync_threads()
@inbounds k!=val+1 && (regm[k,i,j] = 0)
end
return
Expand All @@ -106,7 +106,7 @@ end

import Yao.YaoArrayRegister: insert_qubits!, join
function YaoBase.batched_kron(A::Union{CuArray{T1, 3}, Adjoint{<:Any, <:CuArray{T1, 3}}}, B::Union{CuArray{T2, 3}, Adjoint{<:Any, <:CuArray{T2, 3}}}) where {T1 ,T2}
res = CuArrays.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
CI = Base.CartesianIndices(res)
@inline function kernel(res, A, B)
state = (blockIdx().x-1) * blockDim().x + threadIdx().x
Expand All @@ -133,7 +133,7 @@ Performs batched Kronecker products in-place on the GPU.
The results are stored in 'C', overwriting the existing values of 'C'.
"""
function YaoBase.batched_kron!(C::CuArray{T3, 3}, A::Union{CuArray{T1, 3}, Adjoint{<:Any, <:CuArray{T1, 3}}}, B::Union{CuArray{T2, 3}, Adjoint{<:Any, <:CuArray{T2, 3}}}) where {T1 ,T2, T3}
@boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2)), size(A,3)) || throw(DimensionMismatch())
@boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch())
@boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch())
CI = Base.CartesianIndices(C)
@inline function kernel(C, A, B)
Expand Down
2 changes: 1 addition & 1 deletion src/gpuapplys.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,6 @@ end
using Yao.YaoBlocks
# `_apply_fallback!` method for a `GPUReg{B,T}` register and any `AbstractBlock`.
# After `YaoBlocks._check_size(r, b)`, the block's matrix is built with `mat(T, b)`,
# adapted to `CuArray{T}`, multiplied with `r.state`, and the product is written
# back into `r.state` in place via `.=`. Returns the same register `r`.
# The two assignment lines differ only in the namespace (`CuArrays` vs `CUDA`)
# used to reach `adapt`.
function YaoBlocks._apply_fallback!(r::GPUReg{B,T}, b::AbstractBlock) where {B,T}
YaoBlocks._check_size(r, b)
r.state .= CuArrays.adapt(CuArray{T}, mat(T, b)) * r.state
r.state .= CUDA.adapt(CuArray{T}, mat(T, b)) * r.state
return r
end
12 changes: 8 additions & 4 deletions src/kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,21 @@ end
u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
end
end
u1_kernel(nbit::Int, U1::SDSparseMatrixCSC, ibit::Int) = u1_kernel(nbit, U1|>Matrix, ibit)
"""
    u1_kernel(nbit::Int, U1::SDSparseMatrixCSC, ibit::Int)

Convert `U1` to a `Matrix` and forward to the `u1_kernel` method that accepts
that type, passing `nbit` and `ibit` through unchanged.
"""
function u1_kernel(nbit::Int, U1::SDSparseMatrixCSC, ibit::Int)
    dense = Matrix(U1)
    return u1_kernel(nbit, dense, ibit)
end

@inline function u1_kernel(nbit::Int, U1::SDPermMatrix, ibit::Int)
U1.perm[1] == 1 && return u1_kernel(nbit, Diagonal(U1.vals), ibit)
if U1.perm[1] == 1
return u1_kernel(nbit, Diagonal(U1.vals), ibit)
end

mask = bmask(ibit)
b, c = U1.vals[1], U1.vals[2]
step = 1<<(ibit-1)
configs = itercontrol(nbit, [ibit], [0])

length(configs), @inline function kernel(state, inds)
1<<(nbit-1), function kernel(state, inds)
x = @inbounds configs[inds[1]] + 1
swaprows!(piecewise(state, inds), x, x+step, c, b)
end
Expand Down Expand Up @@ -107,7 +111,7 @@ end
mask = bmask(Int32, bits...)
1<<nbit,@inline function kernel(state, inds)
i = inds[1]
piecewise(state, inds)[i] *= CUDAnative.pow(d, bit_count(Int32(i-1)&mask))
piecewise(state, inds)[i] *= CUDA.pow(d, bit_count(Int32(i-1)&mask))
return
end
end
Expand Down
7 changes: 3 additions & 4 deletions test/CUDApatch.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
using CuYao
using CuArrays, GPUArrays
using CUDA
using Test
using CUDAnative
using YaoBlocks

@testset "isapprox-complex" begin
Expand All @@ -26,8 +25,8 @@ end
@testset "Complex pow" begin
for T in [ComplexF64, ComplexF32]
a = CuArray(randn(T, 4, 4))
@test Array(CUDAnative.pow.(a, Int32(3))) ≈ Array(a).^3
@test Array(CUDAnative.pow.(a, real(T)(3))) ≈ Array(a).^3
@test Array(CUDA.pow.(a, Int32(3))) ≈ Array(a).^3
@test Array(CUDA.pow.(a, real(T)(3))) ≈ Array(a).^3
end
end

Expand Down
2 changes: 1 addition & 1 deletion test/Diff.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ using StatsBase: Weights
using BitBasis
using StaticArrays
using QuAlgorithmZoo
using CuArrays
using CUDA
using YaoExtensions
using YaoExtensions: NDWeights

Expand Down
4 changes: 2 additions & 2 deletions test/GPUReg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ using LinearAlgebra
using BitBasis
using Statistics: mean
using StaticArrays
using CuArrays
CuArrays.allowscalar(false)
using CUDA
CUDA.allowscalar(false)

@testset "basics" begin
a = randn(ComplexF64, 50, 20)
Expand Down
6 changes: 3 additions & 3 deletions test/gpuapplys.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ using Test, Random
using CuYao
using StaticArrays
using Yao.ConstGate: SWAPGate
using CuArrays
using CUDA

@testset "gpu instruct nbit!" begin
Random.seed!(3)
Expand Down Expand Up @@ -45,8 +45,8 @@ end
v1 = randn(ComplexF32, N)
vn = randn(ComplexF32, N, 333)

for U1 in [mat(H), mat(Y), mat(Z), mat(I2), mat(P0)]
@test instruct!(v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(v1 |> copy, U1, (3,))
for U1 in [mat(H), mat(Z), mat(I2), mat(P0), mat(X), mat(Y)]
#@test instruct!(v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(v1 |> copy, U1, (3,))
@test instruct!(vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(vn |> copy, U1, (3,))
end
# sparse matrix like P0, P1 et. al. are not implemented.
Expand Down
4 changes: 2 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using CuArrays
CuArrays.allowscalar(false)
using CUDA
CUDA.allowscalar(false)
include("CUDApatch.jl")
include("GPUReg.jl")
include("gpuapplys.jl")
Expand Down

0 comments on commit e250477

Please sign in to comment.