diff --git a/Cargo.toml b/Cargo.toml index 1ffdae5..93355f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ opencl = ["mnn-sys/opencl"] metal = ["mnn-sys/metal"] coreml = ["mnn-sys/coreml"] +cuda = ["mnn-sys/cuda"] vulkan = [] # This is currently unimplemented @@ -44,7 +45,7 @@ serde = ["dep:serde"] simd = ["mnn-sys/simd"] -default = ["simd"] +# default = ["simd"] [dev-dependencies] diff --git a/benches/mnn-bench.rs b/benches/mnn-bench.rs index f05e438..7163d29 100644 --- a/benches/mnn-bench.rs +++ b/benches/mnn-bench.rs @@ -45,4 +45,19 @@ mod mnn_realesr_bench_with_ones { net.wait(&session); }); } + + #[cfg(feature = "cuda")] + #[divan::bench] + pub fn mnn_realesr_benchmark_cuda(bencher: Bencher) { + let net = Interpreter::from_file("tests/assets/realesr.mnn").unwrap(); + let mut config = ScheduleConfig::new(); + config.set_type(ForwardType::Cuda); + let session = net.create_session(config).unwrap(); + bencher.bench_local(|| { + let mut input = net.input(&session, "data").unwrap(); + input.fill(1f32); + net.run_session(&session).unwrap(); + net.wait(&session); + }); + } } diff --git a/flake.nix b/flake.nix index d007e57..86e00b6 100644 --- a/flake.nix +++ b/flake.nix @@ -42,6 +42,7 @@ flake-utils.lib.eachDefaultSystem ( system: let pkgs = import nixpkgs { + config.allowUnfree = true; inherit system; overlays = [ rust-overlay.overlays.default @@ -67,12 +68,13 @@ extensions = ["rust-docs" "rust-src" "rust-analyzer"]; } // (lib.optionalAttrs pkgs.stdenv.isDarwin { - targets = ["aarch64-apple-darwin" "x86_64-apple-darwin"]; + targets = ["aarch64-apple-darwin" "x86_64-apple-darwin" "wasm32-unknown-unknown"]; })); + nightlyToolchain = pkgs.rust-bin.nightly.latest.default; craneLib = (crane.mkLib pkgs).overrideToolchain rustToolchain; craneLibLLvmTools = (crane.mkLib pkgs).overrideToolchain rustToolchainWithLLvmTools; - src = lib.sources.sourceFilesBySuffices ./. [".rs" ".toml" ".patch" ".mnn" ".h" ".cpp" ".svg" "lock"]; + src = lib.sources.sourceFilesBySuffices ./. [".rs" ".toml" ".patch" ".mnn" ".h" ".cpp" ".svg" ".lock"]; MNN_SRC = pkgs.applyPatches { name = "mnn-src"; src = mnn-src; @@ -80,20 +82,25 @@ }; commonArgs = { inherit src MNN_SRC; + stdenv = pkgs.clangStdenv; pname = "mnn"; doCheck = false; - LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib"; - nativeBuildInputs = with pkgs; [ - cmake - llvmPackages.libclang.lib - clang - pkg-config - ]; - buildInputs = with pkgs; - [] + nativeBuildInputs = with pkgs; + [ + pkg-config + libclang.lib + ] ++ (lib.optionals pkgs.stdenv.isLinux [ + cudatoolkit + ]); + LIBCLANG_PATH = "${pkgs.libclang.lib}/lib"; + buildInputs = with pkgs; + (lib.optionals pkgs.stdenv.isLinux [ ocl-icd opencl-headers + (lib.getDev cudaPackages.cuda_cudart) + (lib.getLib cudaPackages.cuda_cudart) + (lib.getStatic cudaPackages.cuda_cudart) ]) ++ (lib.optionals pkgs.stdenv.isDarwin [ apple-sdk_13 @@ -157,18 +164,13 @@ # name = "mnn-leaks"; # cargoLock = { # lockFile = ./Cargo.lock; - # outputHashes = { - # "cmake-0.1.50" = "sha256-GM2D7dpb2i2S6qYVM4HYk5B40TwKCmGQnUPfXksyf0M="; - # }; # }; # # buildPhase = '' - # cargo test --target aarch64-apple-darwin + # cargo test --profile rwd --target aarch64-apple-darwin # ''; # RUSTFLAGS = "-Zsanitizer=address"; # ASAN_OPTIONS = "detect_leaks=1"; - # # MNN_COMPILE = "NO"; - # # MNN_LIB_DIR = "${pkgs.mnn}/lib"; # } # ); } @@ -200,10 +202,13 @@ }; devShells = { - default = pkgs.mkShell (commonArgs - // { + default = pkgs.mkShell.override {stdenv = pkgs.clangStdenv;} ( + { MNN_SRC = null; LLDB_DEBUGSERVER_PATH = "/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Versions/A/Resources/debugserver"; + nativeBuildInputs = commonArgs.nativeBuildInputs; + buildInputs = commonArgs.buildInputs; + LIBCLANG_PATH = commonArgs.LIBCLANG_PATH; packages = with pkgs; [ cargo-audit @@ -220,14 +225,19 @@ rust-bindgen google-cloud-sdk rustToolchainWithRustAnalyzer + mnn ] ++ ( lib.optionals pkgs.stdenv.isLinux [ + cudatoolkit cargo-llvm-cov ] ); - # ++ (with packages; [bencher inspect]); - }); + } + // lib.optionalAttrs pkgs.stdenv.isLinux { + CUDA_PATH = "${pkgs.cudatoolkit}"; + } + ); }; } ) diff --git a/mnn-sys/Cargo.toml b/mnn-sys/Cargo.toml index 08075d1..5314318 100644 --- a/mnn-sys/Cargo.toml +++ b/mnn-sys/Cargo.toml @@ -30,6 +30,7 @@ thiserror = "2.0.3" [features] opencl = [] +cuda = [] metal = [] coreml = ["metal"] vulkan = [] diff --git a/mnn-sys/build.rs b/mnn-sys/build.rs index 5d06bea..aa5c136 100644 --- a/mnn-sys/build.rs +++ b/mnn-sys/build.rs @@ -257,6 +257,7 @@ pub fn mnn_c_bindgen(vendor: impl AsRef, out: impl AsRef) -> Result< .clang_arg(CxxOption::METAL.cxx()) .clang_arg(CxxOption::COREML.cxx()) .clang_arg(CxxOption::OPENCL.cxx()) + .clang_arg(CxxOption::CUDA.cxx()) .pipe(|builder| { if is_emscripten() { println!("cargo:rustc-cdylib-link-arg=-fvisibility=default"); @@ -314,6 +315,7 @@ pub fn mnn_cpp_bindgen(vendor: impl AsRef, out: impl AsRef) -> Resul .clang_arg(CxxOption::METAL.cxx()) .clang_arg(CxxOption::COREML.cxx()) .clang_arg(CxxOption::OPENCL.cxx()) + .clang_arg(CxxOption::CUDA.cxx()) .clang_arg(format!("-I{}", vendor.join("include").to_string_lossy())) .generate_cstr(true) .generate_inline_functions(true) @@ -327,9 +329,12 @@ pub fn mnn_cpp_bindgen(vendor: impl AsRef, out: impl AsRef) -> Resul .join("Interpreter.hpp") .to_string_lossy(), ) + // .header( + // vendor + // .join("include/MNN/MNNSharedContext.h") + // .to_string_lossy(), + // ) .allowlist_item(".*SessionInfoCode.*"); - // let cmd = bindings.command_line_flags().join(" "); - // println!("cargo:warn=bindgen: {}", cmd); let bindings = bindings.generate().change_context(Error)?; bindings .write_to_file(out.as_ref().join("mnn_cpp.rs")) @@ -351,19 +356,17 @@ pub fn mnn_c_build(path: impl AsRef, vendor: impl AsRef) -> Result<( let vendor = vendor.as_ref(); cc::Build::new() .include(vendor.join("include")) - // .includes(vulkan_includes(vendor)) .pipe(|config| { - #[cfg(feature = "vulkan")] - config.define("MNN_VULKAN", "1"); - #[cfg(feature = "metal")] - config.define("MNN_METAL", "1"); - #[cfg(feature = "coreml")] - config.define("MNN_COREML", "1"); - #[cfg(feature = "opencl")] - config.define("MNN_OPENCL", "ON"); + CxxOption::COREML.define(config); + CxxOption::CUDA.define(config); + CxxOption::METAL.define(config); + CxxOption::OPENCL.define(config); + CxxOption::VULKAN.define(config); if is_emscripten() { config.compiler("emcc"); // We can't compile wasm32-unknown-unknown with emscripten + // emscripten works with cpu backend only so we are not sure if it would work with + // others at all config.target("wasm32-unknown-emscripten"); config.cpp_link_stdlib("c++-noexcept"); } @@ -463,6 +466,7 @@ impl CxxOption { cxx_option_from_features! { VULKAN => "vulkan", "MNN_VULKAN", METAL => "metal", "MNN_METAL", + CUDA => "cuda", "MNN_CUDA", COREML => "coreml", "MNN_COREML", OPENCL => "opencl", "MNN_OPENCL", CRT_STATIC => "crt_static", "MNN_WIN_RUNTIME_MT", @@ -621,6 +625,7 @@ pub fn mnn_cpp_build(vendor: impl AsRef) -> Result<()> { // CxxOption::VULKAN.define(&mut build); // CxxOption::COREML.define(&mut build); + CxxOption::CUDA.define(&mut build); CxxOption::METAL.define(&mut build); CxxOption::OPENCL.define(&mut build); CxxOption::CRT_STATIC.define(&mut build); @@ -697,6 +702,8 @@ pub fn mnn_cpp_build(vendor: impl AsRef) -> Result<()> { let build = opencl(build, vendor).change_context(Error)?; #[cfg(feature = "metal")] let build = metal(build, vendor).change_context(Error)?; + #[cfg(feature = "cuda")] + let build = cuda(build, vendor).change_context(Error)?; build .try_compile("mnn") @@ -1037,3 +1044,83 @@ pub fn cc_builder() -> cc::Build { .std("c++11") .to_owned() } + +pub fn cuda(mut build: cc::Build, vendor: impl AsRef) -> Result { + let cuda_dir = vendor.as_ref().join("source/backend/cuda"); + let (cuda_files_cu, cuda_files_cpp): (Vec<_>, Vec<_>) = + ignore::WalkBuilder::new(cuda_dir.join("core")) + .add(cuda_dir.join("execution")) + .build() + .flatten() + .filter(|p| p.path().has_extension(["cpp", "cu"])) + .map(|e| e.into_path()) + .filter(|p| { + !p.components() + .any(|component| component.as_os_str().eq("plugin")) + }) + .filter(|p| { + !p.components() + .any(|component| component.as_os_str().eq("weight_only_quant")) + }) + .partition(|p| p.has_extension(["cu"])); + + fn cuda_compute(version: u8, enable: bool) -> impl FnOnce(&mut cc::Build) -> &mut cc::Build { + move |build: &mut cc::Build| { + if enable { + build.define(&format!("MNN_CUDA_ENABLE_SM{version}"), None); + } + build.flag("-gencode"); + build.flag(&format!("arch=compute_{version},code=sm_{version}",)) + } + } + + let cuda_objects = cc::Build::new() + .cuda(true) + .cudart("static") + .flag("-m64") + .flag("--std") + .flag("c++11") + .flag("-w") + .flag("-O3") + .flag("-g") + .define("MNN_Cuda_Main_EXPORTS", None) + // .flag("--std=c++17") + // .flag("-O3") + .includes(mnn_includes(vendor.as_ref())) + .include(vendor.as_ref().join("3rd_party/cutlass/v2_9_0/include")) + .include(&cuda_dir) + .pipe(|b| { + if *TARGET_OS == "windows" { + b.flag("-Xcompiler").flag("/FS"); + } + b + }) + .pipe(cuda_compute(60, false)) + .pipe(cuda_compute(61, false)) + .pipe(cuda_compute(62, false)) + .pipe(cuda_compute(70, false)) + .pipe(cuda_compute(72, false)) + .pipe(cuda_compute(75, true)) + .pipe(cuda_compute(80, true)) + .pipe(cuda_compute(86, true)) + .pipe(cuda_compute(89, true)) + .files(cuda_files_cu) + .try_compile_intermediates() + .change_context(Error) + .attach_printable("Failed to compile MNNCuda")?; + + cc_builder() + .includes(mnn_includes(vendor.as_ref())) + .include(vendor.as_ref().join("3rd_party/cutlass/v2_9_0/include")) + .include(&cuda_dir) + .file(cuda_dir.join("Register.cpp")) + .files(cuda_files_cpp) + .objects(cuda_objects) + .cargo_debug(true) + .try_compile("MNNCuda") + .change_context(Error) + .attach_printable("Failed to compile cuda/Register.cpp")?; + + CxxOption::CUDA.define(&mut build); + Ok(build) +} diff --git a/mnn-sys/vendor b/mnn-sys/vendor index 707b8a4..dd43b5a 160000 --- a/mnn-sys/vendor +++ b/mnn-sys/vendor @@ -1 +1 @@ -Subproject commit 707b8a41b25e3d0b7c4a39cd81109d7074ca3c28 +Subproject commit dd43b5aa4b157d892b2ef8c78a5c921024709539 diff --git a/src/schedule.rs b/src/schedule.rs index f32666c..b6f397b 100644 --- a/src/schedule.rs +++ b/src/schedule.rs @@ -45,6 +45,9 @@ pub enum ForwardType { #[cfg(feature = "metal")] /// Use the Metal backend for computation. Metal, + #[cfg(feature = "cuda")] + /// Use the Metal backend for computation. + Cuda, #[cfg(feature = "opencl")] /// Use the OpenCL backend for computation. OpenCL, @@ -65,6 +68,8 @@ impl ForwardType { ForwardType::CPU => MNNForwardType::MNN_FORWARD_CPU, #[cfg(feature = "metal")] ForwardType::Metal => MNNForwardType::MNN_FORWARD_METAL, + #[cfg(feature = "cuda")] + ForwardType::Cuda => MNNForwardType::MNN_FORWARD_CUDA, #[cfg(feature = "opencl")] ForwardType::OpenCL => MNNForwardType::MNN_FORWARD_OPENCL, #[cfg(feature = "vulkan")] @@ -81,6 +86,8 @@ impl ForwardType { MNNForwardType::MNN_FORWARD_CPU => ForwardType::CPU, #[cfg(feature = "metal")] MNNForwardType::MNN_FORWARD_METAL => ForwardType::Metal, + #[cfg(feature = "cuda")] + MNNForwardType::MNN_FORWARD_CUDA => ForwardType::Cuda, #[cfg(feature = "opencl")] MNNForwardType::MNN_FORWARD_OPENCL => ForwardType::OpenCL, #[cfg(feature = "vulkan")] @@ -99,6 +106,8 @@ impl ForwardType { "cpu", #[cfg(feature = "metal")] "metal", + #[cfg(feature = "cuda")] + "cuda", #[cfg(feature = "opencl")] "opencl", #[cfg(feature = "vulkan")] @@ -116,6 +125,8 @@ impl ForwardType { ForwardType::CPU => "cpu", #[cfg(feature = "metal")] ForwardType::Metal => "metal", + #[cfg(feature = "cuda")] + ForwardType::Cuda => "cuda", #[cfg(feature = "opencl")] ForwardType::OpenCL => "opencl", #[cfg(feature = "vulkan")] @@ -136,6 +147,8 @@ impl core::str::FromStr for ForwardType { "cpu" => Ok(ForwardType::CPU), #[cfg(feature = "metal")] "metal" => Ok(ForwardType::Metal), + #[cfg(feature = "cuda")] + "cuda" => Ok(ForwardType::Cuda), #[cfg(feature = "opencl")] "opencl" => Ok(ForwardType::OpenCL), #[cfg(feature = "vulkan")] @@ -464,3 +477,22 @@ impl FromIterator for ScheduleConfigs { } unsafe impl Send for ScheduleConfigs {} +// +// #[derive(Debug, Clone)] +// pub enum UserDeviceContext { +// OpenCL(OpenCLContext), +// Cuda(CudaContext), +// } +// +// #[derive(Debug, Clone)] +// pub struct OpenCLContext { +// pub device_id: u32, +// pub platform_id: u32, +// pub context_ptr: *mut core::ffi::c_void, +// pub gl_shared: *mut core::ffi::c_void, +// } +// +// #[derive(Debug, Clone)] +// pub struct CudaContext { +// pub device_id: u32, +// } diff --git a/tools/bencher/Cargo.toml b/tools/bencher/Cargo.toml index b38025f..17b67b6 100644 --- a/tools/bencher/Cargo.toml +++ b/tools/bencher/Cargo.toml @@ -10,8 +10,11 @@ mnn = { workspace = true, features = ["opencl", "serde", "metal"] } [target."x86_64-apple-darwin".dependencies] mnn = { workspace = true, features = ["opencl", "serde"] } -[target."cfg(windows)".dependencies] -mnn = { workspace = true, features = ["opencl", "serde"] } +[target."cfg(not(target_os = \"macos\"))".dependencies] +mnn = { workspace = true, features = [ + "cuda", + "serde", +], default-features = false } [dependencies] bytemuck = { version = "1.20.0", features = ["extern_crate_alloc"] } diff --git a/tools/bencher/src/main.rs b/tools/bencher/src/main.rs index 87f9f49..dfa0854 100644 --- a/tools/bencher/src/main.rs +++ b/tools/bencher/src/main.rs @@ -211,7 +211,9 @@ impl ScheduleConfigItem { bc.set_power_mode(self.power); bc.set_precision_mode(self.precision); bc.set_memory_mode(self.memory); - sc.set_type(self.forward).set_backend_config(bc); + sc.set_type(self.forward) + .set_backup_type(self.forward) + .set_backend_config(bc); sc } } @@ -306,7 +308,16 @@ pub fn main() -> Result<()> { // let indicatif_layer = IndicatifLayer::new(); tracing_subscriber::registry() .with(cli.verbose.tracing_level_filter()) - .with(tracing_subscriber::fmt::layer().with_writer(Term::stderr)) + .with( + tracing_subscriber::fmt::layer() + .event_format( + tracing_subscriber::fmt::format() + .with_line_number(true) + .with_ansi(true) + .with_file(true), + ) + .with_writer(Term::stderr), + ) .init(); match cli.subcommand {