Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DIRTY] Using m1 intrinsics for f16xf16 #4

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[workspace]
members = ["gemm", "gemm-common", "gemm-f16", "gemm-f32", "gemm-f64", "gemm-c32", "gemm-c64"]
resolver = "2"

[workspace.dependencies]
lazy_static = "1.4"
Expand All @@ -13,3 +14,4 @@ paste = "1.0"

[profile.dev]
opt-level = 3

2 changes: 1 addition & 1 deletion gemm-common/src/gemm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ macro_rules! gemm_def {
$crate::__inject_mod!(avx512f, $ty, 8 * $multiplier, Avx512f);

#[cfg(target_arch = "aarch64")]
$crate::__inject_mod!(neon, $ty, 2 * $multiplier, Scalar);
$crate::__inject_mod!(neon, $ty, 2 * $multiplier, Neon);

#[cfg(target_arch = "wasm32")]
$crate::__inject_mod!(simd128, $ty, 2 * $multiplier, Simd128);
Expand Down
306 changes: 306 additions & 0 deletions gemm-common/src/microkernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,312 @@ macro_rules! microkernel {
};
}

// Generates an `unsafe fn $name` GEMM microkernel that accumulates a tile of
// `lhs * rhs` in local vector accumulators and then merges it into `dst`.
//
// Macro arguments:
// - `[$target]` (optional): target-feature string forwarded to
//   `#[target_feature(enable = ...)]` on the generated function.
// - `$unroll`: how many depth (`k`) steps are unrolled per main-loop iteration.
// - `$name`: name of the generated function.
// - `$mr_div_n`: number of `Pack` vectors held per accumulator column.
// - `$nr`: number of accumulator columns.
// - `$nr_div_n, $n` (optional): when supplied, an additional `execute_neon`
//   step and a fast path taken when `rhs_cs == 1` are generated; that path
//   loads `$nr_div_n` vectors of rhs values and uses `mul_add_lane` on `$n`
//   lanes of each.
//
// The expansion site must have `T`, `Pack`, `N`, `splat`, `mul`, `add` and
// `mul_add` in scope (plus `mul_add_lane` when the optional arguments are
// given); none of them are defined by this macro.
// NOTE(review): the pointer arithmetic below treats a `Pack` as `N`
// consecutive `T` values — confirm against the definitions at the call site.
#[macro_export]
macro_rules! microkernel_f16 {
($([$target: tt])?, $unroll: tt, $name: ident, $mr_div_n: tt, $nr: tt $(, $nr_div_n: tt, $n: tt)?) => {
#[inline]
$(#[target_feature(enable = $target)])?
// Parameters of the generated function, as used by the body below:
// - `m`, `n`: number of rows / columns of `dst` actually written; a full tile
//   is `m == $mr_div_n * N` and `n == $nr`.
// - `k`: depth of the product (number of accumulation steps).
// - `dst`, `dst_cs`, `dst_rs`: destination pointer and its column / row
//   strides, in elements of `T`.
// - `packed_lhs`, `lhs_cs`: lhs pointer and the stride applied per depth step.
// - `packed_rhs`, `rhs_rs`, `rhs_cs`: rhs pointer, the stride applied per
//   depth step, and the stride between the `$nr` values read within a step.
// - `alpha`, `beta`: `alpha` scales the existing contents of `dst`, `beta`
//   scales the freshly accumulated product.
// - `alpha_status`: selects how `dst` is combined with the product:
//   2 => `dst = alpha * dst + beta * acc` (generic alpha),
//   1 => `dst = dst + beta * acc`,
//   anything else (0) => `dst = beta * acc`, without reading `dst`.
// - `_conj_dst`, `_conj_lhs`, `_conj_rhs`: not referenced by the body.
// - `next_lhs`: advanced alongside `packed_lhs` but never dereferenced here.
pub unsafe fn $name(
m: usize,
n: usize,
k: usize,
dst: *mut T,
mut packed_lhs: *const T,
mut packed_rhs: *const T,
dst_cs: isize,
dst_rs: isize,
lhs_cs: isize,
rhs_rs: isize,
rhs_cs: isize,
alpha: T,
beta: T,
alpha_status: u8,
_conj_dst: bool,
_conj_lhs: bool,
_conj_rhs: bool,
mut next_lhs: *const T,
) {
// Zero-initialised accumulators: `$nr` columns of `$mr_div_n` vectors each.
// `accum` views them as a flat `Pack` array indexed `col * $mr_div_n + row`.
let mut accum_storage = [[splat(T::ZERO); $mr_div_n]; $nr];
let accum = accum_storage.as_mut_ptr() as *mut Pack;

// Scratch slots, written through raw pointers before every read:
// one lhs vector per row block, and a single rhs vector.
let mut lhs = [::core::mem::MaybeUninit::<Pack>::uninit(); $mr_div_n];
let mut rhs = ::core::mem::MaybeUninit::<Pack>::uninit();

// Bundle of raw pointers and strides describing one depth step; `Copy` so the
// same value can be reused for every unrolled step.
#[derive(Copy, Clone)]
struct KernelIter {
packed_lhs: *const T,
packed_rhs: *const T,
next_lhs: *const T,
lhs_cs: isize,
rhs_rs: isize,
rhs_cs: isize,
accum: *mut Pack,
lhs: *mut Pack,
rhs: *mut Pack,
}

impl KernelIter {
// Generic depth step number `iter` (relative to the stored base pointers):
// broadcasts each rhs scalar and accumulates it against every lhs vector.
#[inline(always)]
unsafe fn execute(self, iter: usize) {
let packed_lhs = self.packed_lhs.wrapping_offset(iter as isize * self.lhs_cs);
let packed_rhs = self.packed_rhs.wrapping_offset(iter as isize * self.rhs_rs);
let next_lhs = self.next_lhs.wrapping_offset(iter as isize * self.lhs_cs);

// Load the `$mr_div_n` lhs vectors of this step (unaligned reads, `N`
// elements apart).
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
*self.lhs.add(M_ITER) = (packed_lhs.add(M_ITER * N) as *const Pack).read_unaligned();
}});

// For each column: splat the rhs scalar found `N_ITER * rhs_cs` elements in,
// then update that column's accumulators.
// NOTE(review): `mul_add(a, b, c)` is presumably `a * b + c`, matching the
// scalar fallback at the end of the function — confirm at the call site.
seq_macro::seq!(N_ITER in 0..$nr {{
*self.rhs = splat(*packed_rhs.wrapping_offset(N_ITER * self.rhs_cs));
let accum = self.accum.add(N_ITER * $mr_div_n);
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
let accum = &mut *accum.add(M_ITER);
*accum = mul_add(
*self.lhs.add(M_ITER),
*self.rhs,
*accum,
);
}});
}});

// `next_lhs` is computed but intentionally unused in this kernel.
let _ = next_lhs;
}

// Only generated when the optional `$nr_div_n, $n` arguments are supplied.
$(
// Depth step for rhs values that are contiguous (`rhs_cs == 1`, checked in
// debug builds only): loads `$n` rhs values at once as one vector and uses
// `mul_add_lane::<LANE>` to pick each lane instead of splatting scalars.
#[inline(always)]
unsafe fn execute_neon(self, iter: usize) {
debug_assert_eq!(self.rhs_cs, 1);
let packed_lhs = self.packed_lhs.wrapping_offset(iter as isize * self.lhs_cs);
let packed_rhs = self.packed_rhs.wrapping_offset(iter as isize * self.rhs_rs);

seq_macro::seq!(M_ITER in 0..$mr_div_n {{
*self.lhs.add(M_ITER) = (packed_lhs.add(M_ITER * N) as *const Pack).read_unaligned();
}});

// Outer index walks the rhs vectors, inner index walks the lanes of each.
seq_macro::seq!(N_ITER0 in 0..$nr_div_n {{
*self.rhs = (packed_rhs.wrapping_offset(N_ITER0 * $n) as *const Pack).read_unaligned();

seq_macro::seq!(N_ITER1 in 0..$n {{
// Accumulator column addressed by (rhs vector, lane).
const N_ITER: usize = N_ITER0 * $n + N_ITER1;
let accum = self.accum.add(N_ITER * $mr_div_n);
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
let accum = &mut *accum.add(M_ITER);
*accum = mul_add_lane::<N_ITER1>(
*self.lhs.add(M_ITER),
*self.rhs,
*accum,
);
}});
}});
}});
}
)?
}

// Split the depth into `$unroll`-sized chunks plus a remainder.
let k_unroll = k / $unroll;
let k_leftover = k % $unroll;

// This `loop` runs at most once; it exists so the optional fast path below can
// `break` out and skip the generic path.
loop {
$(
// Mentions `$nr_div_n` so this optional block is tied to the optional macro
// arguments; the value itself is discarded.
let _ = $nr_div_n;
// Fast path: contiguous rhs values, handled by `execute_neon`.
if rhs_cs == 1 {
let mut depth = k_unroll;
if depth != 0 {
// Main loop: `$unroll` depth steps per iteration.
loop {
let iter = KernelIter {
packed_lhs,
next_lhs,
packed_rhs,
lhs_cs,
rhs_rs,
rhs_cs,
accum,
lhs: lhs.as_mut_ptr() as _,
rhs: &mut rhs as *mut _ as _,
};

seq_macro::seq!(UNROLL_ITER in 0..$unroll {{
iter.execute_neon(UNROLL_ITER);
}});

// Advance the base pointers past the steps just processed.
packed_lhs = packed_lhs.wrapping_offset($unroll * lhs_cs);
packed_rhs = packed_rhs.wrapping_offset($unroll * rhs_rs);
next_lhs = next_lhs.wrapping_offset($unroll * lhs_cs);

depth -= 1;
if depth == 0 {
break;
}
}
}
// Remainder loop: one depth step per iteration.
depth = k_leftover;
if depth != 0 {
loop {
KernelIter {
packed_lhs,
next_lhs,
packed_rhs,
lhs_cs,
rhs_rs,
rhs_cs,
accum,
lhs: lhs.as_mut_ptr() as _,
rhs: &mut rhs as *mut _ as _,
}
.execute_neon(0);

packed_lhs = packed_lhs.wrapping_offset(lhs_cs);
packed_rhs = packed_rhs.wrapping_offset(rhs_rs);
next_lhs = next_lhs.wrapping_offset(lhs_cs);

depth -= 1;
if depth == 0 {
break;
}
}
}
// Accumulation finished; skip the generic path.
break;
}
)?

// Generic path: same loop structure as above, using the splat-based `execute`.
let mut depth = k_unroll;
if depth != 0 {
loop {
let iter = KernelIter {
packed_lhs,
next_lhs,
packed_rhs,
lhs_cs,
rhs_rs,
rhs_cs,
accum,
lhs: lhs.as_mut_ptr() as _,
rhs: &mut rhs as *mut _ as _,
};

seq_macro::seq!(UNROLL_ITER in 0..$unroll {{
iter.execute(UNROLL_ITER);
}});

packed_lhs = packed_lhs.wrapping_offset($unroll * lhs_cs);
packed_rhs = packed_rhs.wrapping_offset($unroll * rhs_rs);
next_lhs = next_lhs.wrapping_offset($unroll * lhs_cs);

depth -= 1;
if depth == 0 {
break;
}
}
}
depth = k_leftover;
if depth != 0 {
loop {
KernelIter {
packed_lhs,
next_lhs,
packed_rhs,
lhs_cs,
rhs_rs,
rhs_cs,
accum,
lhs: lhs.as_mut_ptr() as _,
rhs: &mut rhs as *mut _ as _,
}
.execute(0);

packed_lhs = packed_lhs.wrapping_offset(lhs_cs);
packed_rhs = packed_rhs.wrapping_offset(rhs_rs);
next_lhs = next_lhs.wrapping_offset(lhs_cs);

depth -= 1;
if depth == 0 {
break;
}
}
}
break;
}

// Write-back. Full tile with unit row stride: whole vectors are read from /
// written to `dst` with unaligned accesses.
if m == $mr_div_n * N && n == $nr && dst_rs == 1 {
let alpha = splat(alpha);
let beta = splat(beta);
if alpha_status == 2 {
// dst = alpha * dst + beta * acc
seq_macro::seq!(N_ITER in 0..$nr {{
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack;
dst.write_unaligned(add(
mul(alpha, dst.read_unaligned()),
mul(beta, *accum.offset(M_ITER + $mr_div_n * N_ITER)),
));
}});
}});
} else if alpha_status == 1 {
// dst = beta * acc + dst
seq_macro::seq!(N_ITER in 0..$nr {{
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack;
dst.write_unaligned(mul_add(
beta,
*accum.offset(M_ITER + $mr_div_n * N_ITER),
dst.read_unaligned(),
));
}});
}});
} else {
// dst = beta * acc; the previous contents of `dst` are never read.
seq_macro::seq!(N_ITER in 0..$nr {{
seq_macro::seq!(M_ITER in 0..$mr_div_n {{
let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack;
dst.write_unaligned(mul(beta, *accum.offset(M_ITER + $mr_div_n * N_ITER)));
}});
}});
}
} else {
// Partial tile or strided rows: take a local copy of the accumulators and
// walk it element by element, touching only the `m` x `n` region of `dst`.
let src = accum_storage; // write to stack
let src = src.as_ptr() as *const T;

if alpha_status == 2 {
for j in 0..n {
let dst_j = dst.offset(dst_cs * j as isize);
// Column `j` of the accumulators starts `j * $mr_div_n * N` elements in.
let src_j = src.add(j * $mr_div_n * N);

for i in 0..m {
let dst_ij = dst_j.offset(dst_rs * i as isize);
let src_ij = src_j.add(i);

*dst_ij = alpha * *dst_ij + beta * *src_ij;
}
}
} else if alpha_status == 1 {
for j in 0..n {
let dst_j = dst.offset(dst_cs * j as isize);
let src_j = src.add(j * $mr_div_n * N);

for i in 0..m {
let dst_ij = dst_j.offset(dst_rs * i as isize);
let src_ij = src_j.add(i);

*dst_ij = *dst_ij + beta * *src_ij;
}
}
} else {
for j in 0..n {
let dst_j = dst.offset(dst_cs * j as isize);
let src_j = src.add(j * $mr_div_n * N);

for i in 0..m {
let dst_ij = dst_j.offset(dst_rs * i as isize);
let src_ij = src_j.add(i);

*dst_ij = beta * *src_ij;
}
}
}
}

}
};
}

#[macro_export]
macro_rules! microkernel_cplx {
($([$target: tt])?, $unroll: tt, $name: ident, $mr_div_n: tt, $nr: tt) => {
Expand Down
18 changes: 18 additions & 0 deletions gemm-common/src/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,24 @@ mod x86 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;

#[cfg(target_arch = "aarch64")]
mod aarch64 {
    use super::*;

    /// Unit marker type whose [`Simd`] implementation runs closures with the
    /// `neon` target feature enabled.
    #[derive(Clone, Copy)]
    pub struct Neon;

    impl Simd for Neon {
        /// Invokes `f` from inside a function compiled with
        /// `#[target_feature(enable = "neon")]`.
        ///
        /// # Safety
        ///
        /// As with any `#[target_feature]` function, the caller must ensure
        /// the executing CPU supports `neon`, in addition to whatever
        /// requirements `f` itself imposes.
        #[inline]
        #[target_feature(enable = "neon")]
        unsafe fn vectorize(f: impl FnOnce()) {
            f();
        }
    }
}

// Re-export the aarch64 items at this module's top level.
#[cfg(target_arch = "aarch64")]
pub use aarch64::*;

#[cfg(target_arch = "wasm32")]
mod wasm32 {
use super::*;
Expand Down
3 changes: 2 additions & 1 deletion gemm-f16/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ paste = { workspace = true }

gemm-common = { version = "0.15", path = "../gemm-common" }
gemm-f32 = { version = "0.15", path = "../gemm-f32" }
half = { version = "2.2", features = ["num-traits"] }
# half = { version = "2.2", features = ["num-traits"] }
half = { git = "https://github.com/Narsil/half-rs", branch="more_intrinsics", features = ["num-traits"] }

[features]
default = ["std"]
Expand Down
Loading
Loading