Skip to content

Commit

Permalink
Adapt Cuda device to add 'a at wrap
Browse files Browse the repository at this point in the history
  • Loading branch information
elftausend committed Nov 26, 2024
1 parent 05d1bd4 commit 53b1cc3
Show file tree
Hide file tree
Showing 11 changed files with 71 additions and 70 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ min-cl = { git = "https://github.com/elftausend/min-cl", optional = true }
[features]
# default = ["cpu", "blas", "static-api", "macro", "cached", "autograd", "stack", "opencl", "fork", "graph", "untyped"]

default = ["cpu", "cached", "autograd", "static-api", "blas", "macro", "fork"]
default = ["cpu", "cached", "autograd", "static-api", "blas", "macro", "fork", "cuda"]
# default = ["no-std"]
# default = ["opencl"]
# default = ["untyped", "cpu", "lazy", "graph", "autograd", "fork", "serde", "json", "half", "cached", "static-api", "stack", "opencl", "nnapi"]
Expand Down
4 changes: 2 additions & 2 deletions examples/implement_operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ where

#[cfg(feature = "cuda")]
// CUDA Implementation
impl<Mods: Retrieve<Self, T>, T: CDatatype> AddBuf<T> for CUDA<Mods> {
fn add(&self, lhs: &Buffer<T, Self>, rhs: &Buffer<T, Self>) -> Buffer<T, Self> {
impl<'a, Mods: Retrieve<'a, Self, T>, T: CDatatype> AddBuf<'a, T> for CUDA<Mods> {
fn add(&'a self, lhs: &Buffer<T, Self>, rhs: &Buffer<T, Self>) -> Buffer<'a, T, Self> {
// generic CUDA kernel
let src = format!(
r#"extern "C" __global__ void add({datatype}* lhs, {datatype}* rhs, {datatype}* out, int numElements)
Expand Down
8 changes: 8 additions & 0 deletions src/devices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,14 @@ macro_rules! impl_device_traits {

impl<Mods> $crate::ReplaceBufPassDown for $device<Mods> {}
impl<Mods> $crate::ExecNowPassDown for $device<Mods> {}

impl<Mods> $crate::HasModules for $device<Mods> {
type Mods = Mods;

fn modules(&self) -> &Self::Mods {
&self.modules
}
}
};
}

Expand Down
12 changes: 2 additions & 10 deletions src/devices/cpu/cpu_device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use core::convert::Infallible;

use crate::{
cpu::CPUPtr, flag::AllocFlag, impl_device_traits, AddLayer, Alloc, Base, Buffer, CloneBuf,
Device, DeviceError, DevicelessAble, HasModules, IsShapeIndep, Module, OnNewBuffer,
RemoveLayer, ReplaceBufPassDown, Setup, Shape, UnaryFusing, Unit, WrappedData,
Device, DeviceError, DevicelessAble, IsShapeIndep, Module, OnNewBuffer,
RemoveLayer, Setup, Shape, UnaryFusing, Unit, WrappedData,
};

pub trait IsCPU {}
Expand Down Expand Up @@ -86,14 +86,6 @@ impl<Mods: WrappedData> Device for CPU<Mods> {

impl<T: Unit, S: Shape> DevicelessAble<'_, T, S> for CPU<Base> {}

impl<Mods> HasModules for CPU<Mods> {
type Mods = Mods;
#[inline]
fn modules(&self) -> &Mods {
&self.modules
}
}

impl<SimpleMods> CPU<SimpleMods> {
#[inline]
pub fn new<'a, NewMods>() -> CPU<SimpleMods::Module>
Expand Down
53 changes: 31 additions & 22 deletions src/devices/cuda/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::{
cuda::{api::cumalloc, CUDAPtr},
flag::AllocFlag,
impl_device_traits, Alloc, Base, Buffer, CloneBuf, Device, IsShapeIndep, Module as CombModule,
OnDropBuffer, OnNewBuffer, Setup, Shape, Unit, WrappedData,
OnNewBuffer, Setup, Shape, Unit, WrappedData,
};

use super::{
Expand Down Expand Up @@ -66,40 +66,51 @@ impl<SimpleMods> CUDA<SimpleMods> {
}
}

impl<Mods: OnDropBuffer> Device for CUDA<Mods> {
type Data<T: Unit, S: Shape> = Mods::Wrap<T, CUDAPtr<T>>;
impl<Mods: WrappedData> Device for CUDA<Mods> {
type Data<'a, T: Unit, S: Shape> = Mods::Wrap<'a, T, CUDAPtr<T>>;
type Base<T: Unit, S: Shape> = CUDAPtr<T>;
type Error = i32;

#[inline(always)]
fn base_to_data<T: Unit, S: Shape>(&self, base: Self::Base<T, S>) -> Self::Data<T, S> {
fn default_base_to_data<'a, T: Unit, S: Shape>(
&'a self,
base: Self::Base<T, S>,
) -> Self::Data<'a, T, S> {
self.wrap_in_base(base)
}

#[inline(always)]
fn wrap_to_data<T: Unit, S: Shape>(
fn default_base_to_data_unbound<'a, T: Unit, S: Shape>(
&self,
wrap: Self::Wrap<T, Self::Base<T, S>>,
) -> Self::Data<T, S> {
base: Self::Base<T, S>,
) -> Self::Data<'a, T, S> {
self.wrap_in_base_unbound(base)
}

#[inline(always)]
fn wrap_to_data<'a, T: Unit, S: Shape>(
&self,
wrap: Self::Wrap<'a, T, Self::Base<T, S>>,
) -> Self::Data<'a, T, S> {
wrap
}

#[inline(always)]
fn data_as_wrap<T: Unit, S: Shape>(
data: &Self::Data<T, S>,
) -> &Self::Wrap<T, Self::Base<T, S>> {
fn data_as_wrap<'a, 'b, T: Unit, S: Shape>(
data: &'b Self::Data<'a, T, S>,
) -> &'b Self::Wrap<'a, T, Self::Base<T, S>> {
data
}

#[inline(always)]
fn data_as_wrap_mut<T: Unit, S: Shape>(
data: &mut Self::Data<T, S>,
) -> &mut Self::Wrap<T, Self::Base<T, S>> {
fn data_as_wrap_mut<'a, 'b, T: Unit, S: Shape>(
data: &'b mut Self::Data<'a, T, S>,
) -> &'b mut Self::Wrap<'a, T, Self::Base<T, S>> {
data
}
}

impl<Mods: OnDropBuffer, T: Unit> Alloc<T> for CUDA<Mods> {
impl<Mods: WrappedData, T: Unit> Alloc<T> for CUDA<Mods> {
#[inline]
fn alloc<S: Shape>(
&self,
Expand All @@ -125,9 +136,9 @@ impl<Mods: OnDropBuffer, T: Unit> Alloc<T> for CUDA<Mods> {
}
}

unsafe impl<Mods: OnDropBuffer> IsShapeIndep for CUDA<Mods> {}
unsafe impl<Mods: WrappedData> IsShapeIndep for CUDA<Mods> {}

impl<Mods: OnDropBuffer> IsCuda for CUDA<Mods> {}
impl<Mods: WrappedData> IsCuda for CUDA<Mods> {}

#[cfg(feature = "fork")]
impl<Mods> crate::ForkSetup for CUDA<Mods> {
Expand All @@ -137,9 +148,7 @@ impl<Mods> crate::ForkSetup for CUDA<Mods> {
}
}

impl<'a, Mods: OnDropBuffer + OnNewBuffer<'a, T, Self, ()>, T: Unit> CloneBuf<'a, T>
for CUDA<Mods>
{
impl<'a, Mods: WrappedData + OnNewBuffer<'a, T, Self, ()>, T: Unit> CloneBuf<'a, T> for CUDA<Mods> {
fn clone_buf(&'a self, buf: &Buffer<'a, T, CUDA<Mods>>) -> Buffer<'a, T, CUDA<Mods>> {
let cloned = Buffer::new(self, buf.len());
unsafe {
Expand All @@ -160,9 +169,9 @@ mod tests {
use super::{IsCuda, CUDA};

// compile-time isCuda test
fn take_cu_buffer<T: Unit, D: IsCuda + Retriever<T>, S: Shape>(
device: &D,
buf: &Buffer<T, D, S>,
fn take_cu_buffer<'a, T: Unit, D: IsCuda + Retriever<'a, T>, S: Shape>(
device: &'a D,
buf: &Buffer<'a, T, D, S>,
) {
let _buf = device.retrieve::<0>(buf.len(), ());
}
Expand Down
11 changes: 1 addition & 10 deletions src/devices/cuda/cuda_ptr.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::api::{cu_read, cufree, cumalloc, CudaResult};
use crate::{flag::AllocFlag, HasId, Id, PtrType, ShallowCopy, Unit, WrappedCopy};
use crate::{flag::AllocFlag, HasId, Id, PtrType, ShallowCopy, Unit};
use core::marker::PhantomData;

/// The pointer used for `CUDA` [`Buffer`](crate::Buffer)s
Expand Down Expand Up @@ -76,15 +76,6 @@ impl<T> Drop for CUDAPtr<T> {
}
}

impl<T> WrappedCopy for CUDAPtr<T> {
type Base = Self;

#[inline]
fn wrapped_copy(&self, to_wrap: Self::Base) -> Self {
to_wrap
}
}

impl<T> ShallowCopy for CUDAPtr<T> {
#[inline]
unsafe fn shallow(&self) -> Self {
Expand Down
4 changes: 2 additions & 2 deletions src/devices/cuda/fusing.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{OnDropBuffer, UnaryFusing, CUDA};
use crate::{UnaryFusing, WrappedData, CUDA};

impl<Mods: OnDropBuffer> UnaryFusing for CUDA<Mods> {
impl<Mods: WrappedData> UnaryFusing for CUDA<Mods> {
#[cfg(feature = "lazy")]
#[cfg(feature = "graph")]
#[inline]
Expand Down
6 changes: 3 additions & 3 deletions src/devices/cuda/kernel_launch.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{number::Number, Buffer, OnDropBuffer, Shape, Unit, CUDA};
use crate::{number::Number, Buffer, Shape, Unit, WrappedData, CUDA};
use std::{collections::HashMap, ffi::c_void};

use super::{
Expand Down Expand Up @@ -45,7 +45,7 @@ pub trait AsCudaCvoidPtr {
impl<'a, T, Mods, S> AsCudaCvoidPtr for &Buffer<'a, T, CUDA<Mods>, S>
where
T: Unit,
Mods: OnDropBuffer,
Mods: WrappedData,
S: Shape,
{
#[inline]
Expand All @@ -54,7 +54,7 @@ where
}
}

impl<'a, T: Unit, Mods: OnDropBuffer, S: Shape> AsCudaCvoidPtr for Buffer<'a, T, CUDA<Mods>, S> {
impl<'a, T: Unit, Mods: WrappedData, S: Shape> AsCudaCvoidPtr for Buffer<'a, T, CUDA<Mods>, S> {
#[inline]
fn as_cvoid_ptr(&self) -> *mut c_void {
&self.base().ptr as *const u64 as *mut c_void
Expand Down
6 changes: 4 additions & 2 deletions src/devices/cuda/lazy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ impl<Mods> crate::LazySetup for CUDA<Mods> {
#[cfg(test)]
mod tests {
use crate::{
AddOperation, ApplyFunction, Base, Buffer, Combiner, Device, HasId, Lazy, Retrieve,
AddOperation, Base, Buffer, Device, HasId, Lazy, Retrieve,
Retriever, Run, CUDA,
};

Expand Down Expand Up @@ -259,7 +259,7 @@ mod tests {
fn_name: &'static str,
) -> Buffer<'a, i32, CUDA<Mods>>
where
Mods: 'static + AddOperation + Retrieve<CUDA<Mods>, i32, ()>,
Mods: 'static + AddOperation + Retrieve<'a, CUDA<Mods>, i32, ()>,
{
let mut out = device.retrieve(lhs.len(), (lhs.id(), rhs.id())).unwrap();

Expand Down Expand Up @@ -303,6 +303,8 @@ mod tests {
#[cfg(feature = "graph")]
#[test]
fn test_cuda_apply_fn_lazy() {
use crate::{ApplyFunction, Combiner};

let device = CUDA::<crate::Graph<Lazy<Base>>>::new(0).unwrap();

let lhs = device.buffer([1., 2., 3., 4., 5., 6.]);
Expand Down
27 changes: 13 additions & 14 deletions src/devices/cuda/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ use crate::{
bounds_to_range,
cuda::api::{cu_read_async, CUstreamCaptureStatus},
op_hint::unary,
pass_down_add_operation, pass_down_exec_now, AddOperation, ApplyFunction, Buffer, CDatatype,
ClearBuf, CopySlice, OnDropBuffer, Read, Resolve, Retrieve, Retriever, SetOpHint, Shape,
ToCLSource, ToMarker, UnaryGrad, Unit, WriteBuf, ZeroGrad, CUDA,
pass_down_add_operation, AddOperation, ApplyFunction, Buffer, CDatatype, ClearBuf, CopySlice,
Read, Resolve, Retrieve, Retriever, SetOpHint, Shape, ToCLSource, ToMarker, UnaryGrad, Unit,
WrappedData, WriteBuf, ZeroGrad, CUDA,
};

use super::{
Expand All @@ -15,9 +15,8 @@ use super::{
};

pass_down_add_operation!(CUDA);
pass_down_exec_now!(CUDA);

impl<Mods: OnDropBuffer, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUDA<Mods> {
impl<Mods: WrappedData, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUDA<Mods> {
type Read<'a>
= Vec<T>
where
Expand Down Expand Up @@ -51,21 +50,21 @@ impl<Mods: OnDropBuffer, T: Unit + Default + Clone, S: Shape> Read<T, S> for CUD
}
}

impl<Mods: OnDropBuffer, T: CDatatype> ClearBuf<T> for CUDA<Mods> {
impl<Mods: WrappedData, T: CDatatype> ClearBuf<T> for CUDA<Mods> {
#[inline]
fn clear(&self, buf: &mut Buffer<T, Self>) {
cu_clear(self, buf).unwrap()
}
}

impl<Mods: OnDropBuffer, T: CDatatype> ZeroGrad<T> for CUDA<Mods> {
impl<Mods: WrappedData, T: CDatatype> ZeroGrad<T> for CUDA<Mods> {
#[inline]
fn zero_grad<S: Shape>(&self, data: &mut Self::Base<T, S>) {
cu_clear(self, data).unwrap()
}
}

impl<Mods: OnDropBuffer, T: Unit> CopySlice<T> for CUDA<Mods> {
impl<Mods: WrappedData, T: Unit> CopySlice<T> for CUDA<Mods> {
fn copy_slice_to<SR: RangeBounds<usize>, DR: RangeBounds<usize>>(
&self,
source: &Buffer<T, Self>,
Expand Down Expand Up @@ -101,7 +100,7 @@ impl<Mods: OnDropBuffer, T: Unit> CopySlice<T> for CUDA<Mods> {
}
}

impl<Mods: OnDropBuffer, T: Unit> WriteBuf<T> for CUDA<Mods> {
impl<Mods: WrappedData, T: Unit> WriteBuf<T> for CUDA<Mods> {
#[inline]
fn write(&self, buf: &mut Buffer<T, Self>, data: &[T]) {
cu_write_async(buf.cu_ptr(), data, &self.mem_transfer_stream).unwrap();
Expand All @@ -119,18 +118,18 @@ impl<Mods: OnDropBuffer, T: Unit> WriteBuf<T> for CUDA<Mods> {
}
}

impl<Mods, T, S> ApplyFunction<T, S> for CUDA<Mods>
impl<'a, Mods, T, S> ApplyFunction<'a, T, S> for CUDA<Mods>
where
T: CDatatype + Default,
Mods: AddOperation + Retrieve<Self, T, S> + SetOpHint<T> + 'static,
Mods: AddOperation + Retrieve<'a, Self, T, S> + SetOpHint<T> + 'static,
S: Shape,
{
#[inline]
fn apply_fn<F>(
&self,
&'a self,
buf: &Buffer<T, Self, S>,
f: impl Fn(Resolve<T>) -> F + Copy + 'static,
) -> Buffer<T, Self, S>
) -> Buffer<'a, T, Self, S>
where
F: crate::TwoWay<T>,
{
Expand Down Expand Up @@ -183,7 +182,7 @@ impl<T, S, Mods> UnaryGrad<T, S> for CUDA<Mods>
where
T: CDatatype + Default,
S: Shape,
Mods: OnDropBuffer + AddOperation + 'static,
Mods: WrappedData + AddOperation + 'static,
{
#[inline]
fn add_unary_grad<F>(
Expand Down
8 changes: 4 additions & 4 deletions src/static_api/to_device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
/// ```
#[cfg(feature = "cuda")]
#[inline]
pub fn to_cuda(self) -> Buffer<'a, T, crate::CUDA> {
pub fn to_cuda(self) -> Buffer<'static, T, crate::CUDA> {
self.to_dev::<crate::CUDA>()
}

Expand All @@ -57,7 +57,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
/// ```
#[cfg(feature = "opencl")]
#[inline]
pub fn to_cl(self) -> Buffer<'a, T, crate::OpenCL> {
pub fn to_cl(self) -> Buffer<'static, T, crate::OpenCL> {
self.to_dev::<crate::OpenCL>()
}

Expand All @@ -78,7 +78,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
#[cfg(feature = "opencl")]
#[cfg(not(feature = "cuda"))]
#[inline]
pub fn to_gpu(self) -> Buffer<'a, T, crate::OpenCL> {
pub fn to_gpu(self) -> Buffer<'static, T, crate::OpenCL> {
self.to_cl()
}

Expand All @@ -98,7 +98,7 @@ impl<'a, T: Unit + Clone> Buffer<'a, T> {
/// ```
#[cfg(feature = "cuda")]
#[inline]
pub fn to_gpu(self) -> Buffer<'a, T, crate::CUDA> {
pub fn to_gpu(self) -> Buffer<'static, T, crate::CUDA> {
self.to_cuda()
}
}
Expand Down

0 comments on commit 53b1cc3

Please sign in to comment.