Adapt Cuda device to add 'a at wrap

elftausend · Nov 26, 2024 · 53b1cc3 · 53b1cc3
1 parent 05d1bd4
commit 53b1cc3
Show file tree

Hide file tree

Showing 11 changed files with 71 additions and 70 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -53,7 +53,7 @@ min-cl = { git = "https://github.com/elftausend/min-cl", optional = true }
 [features]
 # default = ["cpu", "blas", "static-api", "macro", "cached", "autograd", "stack", "opencl", "fork", "graph", "untyped"]
 
-default = ["cpu", "cached", "autograd", "static-api", "blas", "macro", "fork"]
+default = ["cpu", "cached", "autograd", "static-api", "blas", "macro", "fork", "cuda"]
 # default = ["no-std"]
 # default = ["opencl"]
 # default = ["untyped", "cpu", "lazy", "graph", "autograd", "fork", "serde", "json", "half", "cached", "static-api", "stack", "opencl", "nnapi"]

diff --git a/examples/implement_operations.rs b/examples/implement_operations.rs
@@ -99,8 +99,8 @@ where
 
 #[cfg(feature = "cuda")]
 // CUDA Implementation
-impl<Mods: Retrieve<Self, T>, T: CDatatype> AddBuf<T> for CUDA<Mods> {
-    fn add(&self, lhs: &Buffer<T, Self>, rhs: &Buffer<T, Self>) -> Buffer<T, Self> {
+impl<'a, Mods: Retrieve<'a, Self, T>, T: CDatatype> AddBuf<'a, T> for CUDA<Mods> {
+    fn add(&'a self, lhs: &Buffer<T, Self>, rhs: &Buffer<T, Self>) -> Buffer<'a, T, Self> {
         // generic CUDA kernel
         let src = format!(
             r#"extern "C" __global__ void add({datatype}* lhs, {datatype}* rhs, {datatype}* out, int numElements)

diff --git a/src/devices.rs b/src/devices.rs
@@ -150,6 +150,14 @@ macro_rules! impl_device_traits {
 
         impl<Mods> $crate::ReplaceBufPassDown for $device<Mods> {}
         impl<Mods> $crate::ExecNowPassDown for $device<Mods> {}
+
+        impl<Mods> $crate::HasModules for $device<Mods> {
+            type Mods = Mods;
+
+            fn modules(&self) -> &Self::Mods {
+                &self.modules
+            }
+        }
     };
 }
 

diff --git a/src/devices/cpu/cpu_device.rs b/src/devices/cpu/cpu_device.rs
@@ -2,8 +2,8 @@ use core::convert::Infallible;
 
 use crate::{
     cpu::CPUPtr, flag::AllocFlag, impl_device_traits, AddLayer, Alloc, Base, Buffer, CloneBuf,
-    Device, DeviceError, DevicelessAble, HasModules, IsShapeIndep, Module, OnNewBuffer,
-    RemoveLayer, ReplaceBufPassDown, Setup, Shape, UnaryFusing, Unit, WrappedData,
+    Device, DeviceError, DevicelessAble, IsShapeIndep, Module, OnNewBuffer,
+    RemoveLayer, Setup, Shape, UnaryFusing, Unit, WrappedData,
 };
 
 pub trait IsCPU {}
@@ -86,14 +86,6 @@ impl<Mods: WrappedData> Device for CPU<Mods> {
 
 impl<T: Unit, S: Shape> DevicelessAble<'_, T, S> for CPU<Base> {}
 
-impl<Mods> HasModules for CPU<Mods> {
-    type Mods = Mods;
-    #[inline]
-    fn modules(&self) -> &Mods {
-        &self.modules
-    }
-}
-
 impl<SimpleMods> CPU<SimpleMods> {
     #[inline]
     pub fn new<'a, NewMods>() -> CPU<SimpleMods::Module>

diff --git a/src/devices/cuda/cuda.rs b/src/devices/cuda/cuda.rs
@@ -7,7 +7,7 @@ use crate::{
     cuda::{api::cumalloc, CUDAPtr},
     flag::AllocFlag,
     impl_device_traits, Alloc, Base, Buffer, CloneBuf, Device, IsShapeIndep, Module as CombModule,
-    OnDropBuffer, OnNewBuffer, Setup, Shape, Unit, WrappedData,
+    OnNewBuffer, Setup, Shape, Unit, WrappedData,
 };
 
 use super::{
@@ -66,40 +66,51 @@ impl<SimpleMods> CUDA<SimpleMods> {
     }
 }
 
-impl<Mods: OnDropBuffer> Device for CUDA<Mods> {
-    type Data<T: Unit, S: Shape> = Mods::Wrap<T, CUDAPtr<T>>;
+impl<Mods: WrappedData> Device for CUDA<Mods> {
+    type Data<'a, T: Unit, S: Shape> = Mods::Wrap<'a, T, CUDAPtr<T>>;
     type Base<T: Unit, S: Shape> = CUDAPtr<T>;
     type Error = i32;
 
     #[inline(always)]
-    fn base_to_data<T: Unit, S: Shape>(&self, base: Self::Base<T, S>) -> Self::Data<T, S> {
+    fn default_base_to_data<'a, T: Unit, S: Shape>(
+        &'a self,
+        base: Self::Base<T, S>,
+    ) -> Self::Data<'a, T, S> {
         self.wrap_in_base(base)
     }
 
     #[inline(always)]
-    fn wrap_to_data<T: Unit, S: Shape>(
+    fn default_base_to_data_unbound<'a, T: Unit, S: Shape>(
         &self,
-        wrap: Self::Wrap<T, Self::Base<T, S>>,
-    ) -> Self::Data<T, S> {
+        base: Self::Base<T, S>,
+    ) -> Self::Data<'a, T, S> {
+        self.wrap_in_base_unbound(base)
+    }
+
+    #[inline(always)]
+    fn wrap_to_data<'a, T: Unit, S: Shape>(
+        &self,
+        wrap: Self::Wrap<'a, T, Self::Base<T, S>>,
+    ) -> Self::Data<'a, T, S> {
         wrap
     }
 
     #[inline(always)]
-    fn data_as_wrap<T: Unit, S: Shape>(
-        data: &Self::Data<T, S>,
-    ) -> &Self::Wrap<T, Self::Base<T, S>> {
+    fn data_as_wrap<'a, 'b, T: Unit, S: Shape>(
+        data: &'b Self::Data<'a, T, S>,
+    ) -> &'b Self::Wrap<'a, T, Self::Base<T, S>> {
         data
     }
 
     #[inline(always)]
-    fn data_as_wrap_mut<T: Unit, S: Shape>(
-        data: &mut Self::Data<T, S>,
-    ) -> &mut Self::Wrap<T, Self::Base<T, S>> {
+    fn data_as_wrap_mut<'a, 'b, T: Unit, S: Shape>(
+        data: &'b mut Self::Data<'a, T, S>,
+    ) -> &'b mut Self::Wrap<'a, T, Self::Base<T, S>> {
         data
     }
 }
 
-impl<Mods: OnDropBuffer, T: Unit> Alloc<T> for CUDA<Mods> {
+impl<Mods: WrappedData, T: Unit> Alloc<T> for CUDA<Mods> {
     #[inline]
     fn alloc<S: Shape>(
         &self,
@@ -125,9 +136,9 @@ impl<Mods: OnDropBuffer, T: Unit> Alloc<T> for CUDA<Mods> {
     }
 }
 
-unsafe impl<Mods: OnDropBuffer> IsShapeIndep for CUDA<Mods> {}
+unsafe impl<Mods: WrappedData> IsShapeIndep for CUDA<Mods> {}
 
-impl<Mods: OnDropBuffer> IsCuda for CUDA<Mods> {}
+impl<Mods: WrappedData> IsCuda for CUDA<Mods> {}
 
 #[cfg(feature = "fork")]
 impl<Mods> crate::ForkSetup for CUDA<Mods> {
@@ -137,9 +148,7 @@ impl<Mods> crate::ForkSetup for CUDA<Mods> {
     }
 }
 
-impl<'a, Mods: OnDropBuffer + OnNewBuffer<'a, T, Self, ()>, T: Unit> CloneBuf<'a, T>
-    for CUDA<Mods>
-{
+impl<'a, Mods: WrappedData + OnNewBuffer<'a, T, Self, ()>, T: Unit> CloneBuf<'a, T> for CUDA<Mods> {
     fn clone_buf(&'a self, buf: &Buffer<'a, T, CUDA<Mods>>) -> Buffer<'a, T, CUDA<Mods>> {
         let cloned = Buffer::new(self, buf.len());
         unsafe {
@@ -160,9 +169,9 @@ mod tests {
     use super::{IsCuda, CUDA};
 
     // compile-time isCuda test
-    fn take_cu_buffer<T: Unit, D: IsCuda + Retriever<T>, S: Shape>(
-        device: &D,
-        buf: &Buffer<T, D, S>,
+    fn take_cu_buffer<'a, T: Unit, D: IsCuda + Retriever<'a, T>, S: Shape>(
+        device: &'a D,
+        buf: &Buffer<'a, T, D, S>,
     ) {
         let _buf = device.retrieve::<0>(buf.len(), ());
     }

diff --git a/src/devices/cuda/cuda_ptr.rs b/src/devices/cuda/cuda_ptr.rs
@@ -1,5 +1,5 @@
 use super::api::{cu_read, cufree, cumalloc, CudaResult};
-use crate::{flag::AllocFlag, HasId, Id, PtrType, ShallowCopy, Unit, WrappedCopy};
+use crate::{flag::AllocFlag, HasId, Id, PtrType, ShallowCopy, Unit};
 use core::marker::PhantomData;
 
 /// The pointer used for `CUDA` [`Buffer`](crate::Buffer)s
@@ -76,15 +76,6 @@ impl<T> Drop for CUDAPtr<T> {
     }
 }
 
-impl<T> WrappedCopy for CUDAPtr<T> {
-    type Base = Self;
-
-    #[inline]
-    fn wrapped_copy(&self, to_wrap: Self::Base) -> Self {
-        to_wrap
-    }
-}
-
 impl<T> ShallowCopy for CUDAPtr<T> {
     #[inline]
     unsafe fn shallow(&self) -> Self {

diff --git a/src/devices/cuda/fusing.rs b/src/devices/cuda/fusing.rs
@@ -1,6 +1,6 @@
-use crate::{OnDropBuffer, UnaryFusing, CUDA};
+use crate::{UnaryFusing, WrappedData, CUDA};
 
-impl<Mods: OnDropBuffer> UnaryFusing for CUDA<Mods> {
+impl<Mods: WrappedData> UnaryFusing for CUDA<Mods> {
     #[cfg(feature = "lazy")]
     #[cfg(feature = "graph")]
     #[inline]

diff --git a/src/devices/cuda/kernel_launch.rs b/src/devices/cuda/kernel_launch.rs
@@ -1,4 +1,4 @@
-use crate::{number::Number, Buffer, OnDropBuffer, Shape, Unit, CUDA};
+use crate::{number::Number, Buffer, Shape, Unit, WrappedData, CUDA};
 use std::{collections::HashMap, ffi::c_void};
 
 use super::{
@@ -45,7 +45,7 @@ pub trait AsCudaCvoidPtr {
 impl<'a, T, Mods, S> AsCudaCvoidPtr for &Buffer<'a, T, CUDA<Mods>, S>
 where
     T: Unit,
-    Mods: OnDropBuffer,
+    Mods: WrappedData,
     S: Shape,
 {
     #[inline]
@@ -54,7 +54,7 @@ where
     }
 }
 
-impl<'a, T: Unit, Mods: OnDropBuffer, S: Shape> AsCudaCvoidPtr for Buffer<'a, T, CUDA<Mods>, S> {
+impl<'a, T: Unit, Mods: WrappedData, S: Shape> AsCudaCvoidPtr for Buffer<'a, T, CUDA<Mods>, S> {
     #[inline]
     fn as_cvoid_ptr(&self) -> *mut c_void {
         &self.base().ptr as *const u64 as *mut c_void

diff --git a/src/devices/cuda/lazy.rs b/src/devices/cuda/lazy.rs
@@ -74,7 +74,7 @@ impl<Mods> crate::LazySetup for CUDA<Mods> {
 #[cfg(test)]
 mod tests {
     use crate::{
-        AddOperation, ApplyFunction, Base, Buffer, Combiner, Device, HasId, Lazy, Retrieve,
+        AddOperation, Base, Buffer, Device, HasId, Lazy, Retrieve,
         Retriever, Run, CUDA,
     };
 
@@ -259,7 +259,7 @@ mod tests {
         fn_name: &'static str,
     ) -> Buffer<'a, i32, CUDA<Mods>>
     where
-        Mods: 'static + AddOperation + Retrieve<CUDA<Mods>, i32, ()>,
+        Mods: 'static + AddOperation + Retrieve<'a, CUDA<Mods>, i32, ()>,
     {
         let mut out = device.retrieve(lhs.len(), (lhs.id(), rhs.id())).unwrap();
 
@@ -303,6 +303,8 @@ mod tests {
     #[cfg(feature = "graph")]
     #[test]
     fn test_cuda_apply_fn_lazy() {
+        use crate::{ApplyFunction, Combiner};
+
         let device = CUDA::<crate::Graph<Lazy<Base>>>::new(0).unwrap();
 
         let lhs = device.buffer([1., 2., 3., 4., 5., 6.]);

diff --git a/src/devices/cuda/ops.rs b/src/devices/cuda/ops.rs
@@ -4,9 +4,9 @@ use crate::{
     bounds_to_range,
     cuda::api::{cu_read_async, CUstreamCaptureStatus},
     op_hint::unary,
-    pass_down_add_operation, pass_down_exec_now, AddOperation, ApplyFunction, Buffer, CDatatype,
-    ClearBuf, CopySlice, OnDropBuffer, Read, Resolve, Retrieve, Retriever, SetOpHint, Shape,
-    ToCLSource, ToMarker, UnaryGrad, Unit, WriteBuf, ZeroGrad, CUDA,
+    pass_down_add_operation, AddOperation, ApplyFunction, Buffer, CDatatype, ClearBuf, CopySlice,
+    Read, Resolve, Retrieve, Retriever, SetOpHint, Shape, ToCLSource, ToMarker, UnaryGrad, Unit,
+    WrappedData, WriteBuf, ZeroGrad, CUDA,
 };
 
 use super::{
@@ -15,9 +15,8 @@ use super::{
 };
 
 pass_down_add_operation!(CUDA);
-pass_down_exec_now!(CUDA);
 
-impl<Mods: OnDropBuffer, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUDA<Mods> {
+impl<Mods: WrappedData, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUDA<Mods> {
     type Read<'a>
         = Vec<T>
     where
@@ -51,21 +50,21 @@ impl<Mods: OnDropBuffer, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUD
     }
 }
 
-impl<Mods: OnDropBuffer, T: CDatatype> ClearBuf<T> for CUDA<Mods> {
+impl<Mods: WrappedData, T: CDatatype> ClearBuf<T> for CUDA<Mods> {
     #[inline]
     fn clear(&self, buf: &mut Buffer<T, Self>) {
         cu_clear(self, buf).unwrap()
     }
 }
 
-impl<Mods: OnDropBuffer, T: CDatatype> ZeroGrad<T> for CUDA<Mods> {
+impl<Mods: WrappedData, T: CDatatype> ZeroGrad<T> for CUDA<Mods> {
     #[inline]
     fn zero_grad<S: Shape>(&self, data: &mut Self::Base<T, S>) {
         cu_clear(self, data).unwrap()
     }
 }
 
-impl<Mods: OnDropBuffer, T: Unit> CopySlice<T> for CUDA<Mods> {
+impl<Mods: WrappedData, T: Unit> CopySlice<T> for CUDA<Mods> {
     fn copy_slice_to<SR: RangeBounds<usize>, DR: RangeBounds<usize>>(
         &self,
         source: &Buffer<T, Self>,
@@ -101,7 +100,7 @@ impl<Mods: OnDropBuffer, T: Unit> CopySlice<T> for CUDA<Mods> {
     }
 }
 
-impl<Mods: OnDropBuffer, T: Unit> WriteBuf<T> for CUDA<Mods> {
+impl<Mods: WrappedData, T: Unit> WriteBuf<T> for CUDA<Mods> {
     #[inline]
     fn write(&self, buf: &mut Buffer<T, Self>, data: &[T]) {
         cu_write_async(buf.cu_ptr(), data, &self.mem_transfer_stream).unwrap();
@@ -119,18 +118,18 @@ impl<Mods: OnDropBuffer, T: Unit> WriteBuf<T> for CUDA<Mods> {
     }
 }
 
-impl<Mods, T, S> ApplyFunction<T, S> for CUDA<Mods>
+impl<'a, Mods, T, S> ApplyFunction<'a, T, S> for CUDA<Mods>
 where
     T: CDatatype + Default,
-    Mods: AddOperation + Retrieve<Self, T, S> + SetOpHint<T> + 'static,
+    Mods: AddOperation + Retrieve<'a, Self, T, S> + SetOpHint<T> + 'static,
     S: Shape,
 {
     #[inline]
     fn apply_fn<F>(
-        &self,
+        &'a self,
         buf: &Buffer<T, Self, S>,
         f: impl Fn(Resolve<T>) -> F + Copy + 'static,
-    ) -> Buffer<T, Self, S>
+    ) -> Buffer<'a, T, Self, S>
     where
         F: crate::TwoWay<T>,
     {
@@ -183,7 +182,7 @@ impl<T, S, Mods> UnaryGrad<T, S> for CUDA<Mods>
 where
     T: CDatatype + Default,
     S: Shape,
-    Mods: OnDropBuffer + AddOperation + 'static,
+    Mods: WrappedData + AddOperation + 'static,
 {
     #[inline]
     fn add_unary_grad<F>(

diff --git a/src/static_api/to_device.rs b/src/static_api/to_device.rs
@@ -39,7 +39,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
     /// ```
     #[cfg(feature = "cuda")]
     #[inline]
-    pub fn to_cuda(self) -> Buffer<'a, T, crate::CUDA> {
+    pub fn to_cuda(self) -> Buffer<'static, T, crate::CUDA> {
         self.to_dev::<crate::CUDA>()
     }
 
@@ -57,7 +57,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
     /// ```
     #[cfg(feature = "opencl")]
     #[inline]
-    pub fn to_cl(self) -> Buffer<'a, T, crate::OpenCL> {
+    pub fn to_cl(self) -> Buffer<'static, T, crate::OpenCL> {
         self.to_dev::<crate::OpenCL>()
     }
 
@@ -78,7 +78,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
     #[cfg(feature = "opencl")]
     #[cfg(not(feature = "cuda"))]
     #[inline]
-    pub fn to_gpu(self) -> Buffer<'a, T, crate::OpenCL> {
+    pub fn to_gpu(self) -> Buffer<'static, T, crate::OpenCL> {
         self.to_cl()
     }
 
@@ -98,7 +98,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
     /// ```
     #[cfg(feature = "cuda")]
     #[inline]
-    pub fn to_gpu(self) -> Buffer<'a, T, crate::CUDA> {
+    pub fn to_gpu(self) -> Buffer<'static, T, crate::CUDA> {
         self.to_cuda()
     }
 }