Optimize inner gpu type (#172)

--------- Co-authored-by: Stefan Krastanov <[email protected]>
QuantumSavory · Sep 8, 2023 · 5c9822e · 5c9822e
1 parent 4e767d2
commit 5c9822e
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 18 deletions.
diff --git a/benchmark/benchmark_gpu_innertype.jl b/benchmark/benchmark_gpu_innertype.jl
@@ -0,0 +1,81 @@
+using Revise
+using QuantumClifford
+using QuantumClifford: to_cpu, to_gpu
+using CUDA
+using BenchmarkTools
+using Plots
+using ProgressBars
+using DataFrames
+using CSV
+
+include("surface-code-circuit.jl")
+
+ints = [UInt8, UInt16, UInt32, UInt64, UInt128]
+names = [:UInt8, :UInt16, :UInt32, :UInt64, :UInt128]
+
+function get_stats()
+    println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...")
+
+    # Powers of 2 to benchmark
+    powers = reverse(3:18) # start from the hardest one first! (fail fast)
+    n_values = 2 .^ powers
+
+    times = Dict([
+        sm=>[]
+        for (sm, tp) in zip(names, ints)
+    ])
+
+    ccircuit = if eltype(circ) <: QuantumClifford.CompactifiedGate
+        circ
+    else
+        compactify_circuit(circ)
+    end
+
+    CUDA.allowscalar(false)
+
+
+    for n in ProgressBar(n_values)    
+        for (sm, T) in zip(names, ints)
+            gpu_frames() = to_gpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n), T)
+            gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit), synchronize()
+            @show n, T
+            @elapsed gpu_column() # to eliminate compile time
+            push!(times[sm], @belapsed $gpu_column())
+        end
+        # note. this is also measuring the time it takes to move data to gpu
+    end
+    df = DataFrame(times)
+    df.n_values=n_values
+    return df
+end
+
+function load_stats()
+    if isfile("gpu_innertype_comparison.csv")
+        DataFrame(CSV.File("gpu_innertype_comparison.csv"))
+    else
+        get_stats()
+    end
+end
+
+function save_stats(df)
+    CSV.write("gpu_innertype_comparison.csv", df)
+end
+
+function plot_result(df)
+    my_y_axis(arrs) = begin
+        arr = []
+        for a in arrs
+            append!(arr, a)
+        end
+        logarr = log.(arr)
+        exp.(range(min(logarr...), stop=max(logarr...), length=8))
+    end
+    plot(df.n_values, [df[!, col] for col in names], marker=:circle, label=reshape(["times $col" for col in names], (1, length(names))), xlabel="n", ylabel="Execution Time (s)", title="gpu inner type comparison (fast column)", xticks=df.n_values[1:2:end], xscale=:log2, yticks=my_y_axis([df[!, col] for col in names]), yscale=:log10, legend=:topleft, size=(1200, 800))
+end
+
+function main()
+    df = load_stats()
+    save_stats(df)
+    plot_result(df)
+    savefig("benchmark_gpu_innertype.png")
+end
diff --git a/ext/QuantumCliffordGPUExt/adapters.jl b/ext/QuantumCliffordGPUExt/adapters.jl
@@ -1,33 +1,40 @@
-# const InnerGPUType = UInt64;
+const InnerGPUType = UInt32;
+const InnerCPUType = UInt64;
 # todo. make the conversion types consiting with this...
 # also define a cpu default type?!
 
-to_gpu(array::AbstractArray) = CuArray(array);
+to_gpu(array::AbstractArray) = CuArray(array)
 to_cpu(array::AbstractArray) = Array(array);
 
+# changes the type to InnerGPUType or InnerCPUType as well as copying to cpu/gpu
+to_gpu(array::AbstractArray, tp::Type{T}) where {T <: Unsigned} = 
+    CuArray(reinterpret(T, collect(array)))
+to_cpu(array::AbstractArray, tp::Type{T}) where {T <: Unsigned} = 
+    Array(reinterpret(T, collect(array)))
+
 # maybe change the format of storing the data in gpu array 
 # so that it is more convinient to work with them on gpu?
 # todo later add some type checking to avoid copying (or throw error) if the data is already on gpu/cpu
-to_gpu(tab::QuantumClifford.Tableau) =
-    QuantumClifford.Tableau(to_gpu(tab.phases), tab.nqubits, to_gpu(tab.xzs))
+to_gpu(tab::QuantumClifford.Tableau, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.Tableau(to_gpu(tab.phases), tab.nqubits, to_gpu(tab.xzs, tp))
 
-to_cpu(tab::QuantumClifford.Tableau) =
-    QuantumClifford.Tableau(to_cpu(tab.phases), tab.nqubits, to_cpu(tab.xzs))
+to_cpu(tab::QuantumClifford.Tableau, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.Tableau(to_cpu(tab.phases), tab.nqubits, to_cpu(tab.xzs, tp))
 
-to_gpu(pauli::QuantumClifford.PauliOperator) =
-    QuantumClifford.PauliOperator(to_gpu(pauli.phase), pauli.nqubits, to_gpu(pauli.xz))
+to_gpu(pauli::QuantumClifford.PauliOperator, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliOperator(to_gpu(pauli.phase), pauli.nqubits, to_gpu(pauli.xz, tp))
 
-to_cpu(pauli::QuantumClifford.PauliOperator) =
-    QuantumClifford.PauliOperator(to_cpu(pauli.phase), pauli.nqubits, to_cpu(pauli.xz))
+to_cpu(pauli::QuantumClifford.PauliOperator, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliOperator(to_cpu(pauli.phase), pauli.nqubits, to_cpu(pauli.xz, tp))
 
-to_gpu(stabilizer::QuantumClifford.Stabilizer) =
-    Stabilizer(to_gpu(tab(stabilizer)))
+to_gpu(stabilizer::QuantumClifford.Stabilizer, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    Stabilizer(to_gpu(tab(stabilizer), tp))
 
-to_cpu(stabilizer::QuantumClifford.Stabilizer) = 
-    Stabilizer(to_cpu(tab(stabilizer)))
+to_cpu(stabilizer::QuantumClifford.Stabilizer, tp::Type{T}=InnerCPUType) where {T <: Unsigned} = 
+    Stabilizer(to_cpu(tab(stabilizer), tp))
 
-to_gpu(pauli_frame::QuantumClifford.PauliFrame) =
-    QuantumClifford.PauliFrame(to_gpu(pauli_frame.frame), to_gpu(pauli_frame.measurements))
+to_gpu(pauli_frame::QuantumClifford.PauliFrame, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliFrame(to_gpu(pauli_frame.frame, tp), to_gpu(pauli_frame.measurements))
 
-to_cpu(pauli_frame::QuantumClifford.PauliFrame) =
-    QuantumClifford.PauliFrame(to_cpu(pauli_frame.frame), to_cpu(pauli_frame.measurements))
+to_cpu(pauli_frame::QuantumClifford.PauliFrame, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliFrame(to_cpu(pauli_frame.frame, tp), to_cpu(pauli_frame.measurements))
diff --git a/ext/QuantumCliffordGPUExt/apply_noise.jl b/ext/QuantumCliffordGPUExt/apply_noise.jl
@@ -27,6 +27,7 @@ function applynoise_kernel(xzs::DeviceMatrix{Tme},
         return nothing
     end
 
+
     r = rand()
     if  r < p # X error
         xzs[ibig,f] ⊻= ismallm