diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index e32c30be42..5cc0996ded 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -248,3 +248,121 @@ function run_and_collect(cmd) return proc, log end + + + +## opaque closures + +# TODO: once stabilised, move bits of this into GPUCompiler.jl + +using Core.Compiler: IRCode +using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode + +struct OpaqueClosure{F, E, A, R} # func, env, args, ret + env::E +end + +# XXX: because we can't call functions from other CUDA modules, we effectively need to +# recompile when the target function changes. this, and because of how GPUCompiler's +# deferred compilation mechanism currently works, is why we have `F` as a type param. + +# XXX: because of GPU code requiring specialized signatures, we also need to recompile +# when the environment or argument types change. together with the above, this +# negates much of the benefit of opaque closures. + +# TODO: support for constructing an opaque closure from source code + +# TODO: complete support for passing an environment. this probably requires a split into +# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. + +function compute_ir_rettype(ir::IRCode) + rt = Union{} + for i = 1:length(ir.stmts) + stmt = ir.stmts[i][:inst] + if isa(stmt, Core.Compiler.ReturnNode) && isdefined(stmt, :val) + rt = Core.Compiler.tmerge(Core.Compiler.argextype(stmt.val, ir), rt) + end + end + return Core.Compiler.widenconst(rt) +end + +function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool) + argtypes = Vector{Any}(undef, nargs) + for i = 1:nargs + argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1]) + end + if isva + lastarg = pop!(argtypes) + if lastarg <: Tuple + append!(argtypes, lastarg.parameters) + else + push!(argtypes, Vararg{Any}) + end + end + return Tuple{argtypes...} +end + +function OpaqueClosure(ir::IRCode, @nospecialize env...; isva::Bool = false) + # NOTE: we need ir.argtypes[1] == typeof(env) + ir = Core.Compiler.copy(ir) + nargs = length(ir.argtypes)-1 + sig = compute_oc_signature(ir, nargs, isva) + rt = compute_ir_rettype(ir) + src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ()) + src.slotnames = Base.fill(:none, nargs+1) + src.slotflags = Base.fill(zero(UInt8), length(ir.argtypes)) + src.slottypes = copy(ir.argtypes) + src.rettype = rt + src = Core.Compiler.ir_to_codeinf!(src, ir) + config = compiler_config(device(); kernel=false) + return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) +end + +function OpaqueGPUClosure(src::CodeInfo, @nospecialize env...) + src.inferred || throw(ArgumentError("Expected inferred src::CodeInfo")) + mi = src.parent::Core.MethodInstance + sig = Base.tuple_type_tail(mi.specTypes) + method = mi.def::Method + nargs = method.nargs-1 + isva = method.isva + return generate_opaque_closure(config, src, sig, src.rettype, nargs, isva, env...) +end + +function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, + @nospecialize(sig), @nospecialize(rt), + nargs::Int, isva::Bool, @nospecialize env...; + mod::Module=@__MODULE__, + file::Union{Nothing,Symbol}=nothing, line::Int=0) + # create a method (like `jl_make_opaque_closure_method`) + meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main) + meth.sig = Tuple + meth.isva = isva # XXX: probably not supported? + meth.is_for_opaque_closure = 0 # XXX: do we want this? + meth.name = Symbol("opaque gpu closure") + meth.nargs = nargs + 1 + meth.file = something(file, Symbol()) + meth.line = line + ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src) + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), sig.parameters...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config) # this captures the current world age + + # create a code instance and store it in the cache + ci = CodeInstance(mi, rt, C_NULL, src, Int32(0), meth.primary_world, typemax(UInt), + UInt32(0), UInt32(0), nothing, UInt8(0)) + Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) + + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + return OpaqueClosure{id, typeof(env), sig, rt}(env) +end + +# device-side call to an opaque closure +function (oc::OpaqueClosure{F})(a, b) where F + ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F) + assume(ptr != C_NULL) + return ccall(ptr, Int, (Int, Int), a, b) +end diff --git a/test/execution.jl b/test/execution.jl index 85bd7a62b9..41532d37e5 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1085,3 +1085,49 @@ end end ############################################################################################ + +if VERSION >= v"1.10-" +@testset "opaque closures" begin + +# basic closure, constructed from IRCode +let + ir, rettyp = only(Base.code_ircode(+, (Int, Int))) + oc = CUDA.OpaqueClosure(ir) + + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 +end + +# basic closure, constructed from CodeInfo +let + ir, rettyp = only(Base.code_typed(+, (Int, Int))) + oc = CUDA.OpaqueClosure(ir) + + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 +end + +end +end + +############################################################################################