LaurentMazare · Narsil · Aug 1, 2023 · Aug 1, 2023
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,5 +1,6 @@
 [workspace]
 members = ["gemm", "gemm-common", "gemm-f16", "gemm-f32", "gemm-f64", "gemm-c32", "gemm-c64"]
+resolver = "2"
 
 [workspace.dependencies]
 lazy_static = "1.4"
@@ -13,3 +14,4 @@ paste = "1.0"
 
 [profile.dev]
 opt-level = 3
+
diff --git a/gemm-common/src/gemm.rs b/gemm-common/src/gemm.rs
@@ -814,7 +814,7 @@ macro_rules! gemm_def {
         $crate::__inject_mod!(avx512f, $ty, 8 * $multiplier, Avx512f);
 
         #[cfg(target_arch = "aarch64")]
-        $crate::__inject_mod!(neon, $ty, 2 * $multiplier, Scalar);
+        $crate::__inject_mod!(neon, $ty, 2 * $multiplier, Neon);
 
         #[cfg(target_arch = "wasm32")]
         $crate::__inject_mod!(simd128, $ty, 2 * $multiplier, Simd128);

diff --git a/gemm-common/src/microkernel.rs b/gemm-common/src/microkernel.rs
@@ -414,6 +414,312 @@ macro_rules! microkernel {
     };
 }
 
+#[macro_export]
+macro_rules! microkernel_f16 { // generates one unsafe micro-kernel fn: accumulate a packed lhs x rhs tile over k steps, then merge it into dst
+    ($([$target: tt])?, $unroll: tt, $name: ident, $mr_div_n: tt, $nr: tt $(, $nr_div_n: tt, $n: tt)?) => { // optional target feature, k-unroll factor, fn name, tile of $mr_div_n packs x $nr columns, optional lane-path split (presumably $nr_div_n * $n == $nr)
+        #[inline]
+        $(#[target_feature(enable = $target)])?
+        // alpha_status: 2 => dst = alpha * dst + beta * acc, 1 => dst = dst + beta * acc, any other value => dst = beta * acc
+        pub unsafe fn $name(
+            m: usize,
+            n: usize,
+            k: usize,
+            dst: *mut T,
+            mut packed_lhs: *const T,
+            mut packed_rhs: *const T,
+            dst_cs: isize,
+            dst_rs: isize,
+            lhs_cs: isize,
+            rhs_rs: isize,
+            rhs_cs: isize,
+            alpha: T, // scales the existing dst values; only used when alpha_status == 2
+            beta: T, // scales the accumulated product in every branch
+            alpha_status: u8,
+            _conj_dst: bool, // unused by this kernel
+            _conj_lhs: bool, // unused by this kernel
+            _conj_rhs: bool, // unused by this kernel
+            mut next_lhs: *const T, // advanced alongside packed_lhs but never dereferenced here
+        ) {
+            let mut accum_storage = [[splat(T::ZERO); $mr_div_n]; $nr]; // accumulator tile: $nr columns of $mr_div_n packs, initialised from T::ZERO
+            let accum = accum_storage.as_mut_ptr() as *mut Pack; // flat view of the tile; pack (m, n) sits at index m + $mr_div_n * n
+
+            let mut lhs = [::core::mem::MaybeUninit::<Pack>::uninit(); $mr_div_n]; // scratch for the current lhs column, written before each use
+            let mut rhs = ::core::mem::MaybeUninit::<Pack>::uninit(); // scratch for the current rhs pack, written before each use
+
+            #[derive(Copy, Clone)]
+            struct KernelIter { // raw pointers and strides for one group of k-steps; Copy so one value can drive every unrolled step
+                packed_lhs: *const T,
+                packed_rhs: *const T,
+                next_lhs: *const T,
+                lhs_cs: isize,
+                rhs_rs: isize,
+                rhs_cs: isize,
+                accum: *mut Pack,
+                lhs: *mut Pack,
+                rhs: *mut Pack,
+            }
+
+            impl KernelIter {
+                #[inline(always)]
+                unsafe fn execute(self, iter: usize) { // general path: one k-step at offset `iter`, rhs elements splatted one at a time
+                    let packed_lhs = self.packed_lhs.wrapping_offset(iter as isize * self.lhs_cs);
+                    let packed_rhs = self.packed_rhs.wrapping_offset(iter as isize * self.rhs_rs);
+                    let next_lhs = self.next_lhs.wrapping_offset(iter as isize * self.lhs_cs);
+
+                    seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                        *self.lhs.add(M_ITER) = (packed_lhs.add(M_ITER * N) as *const Pack).read_unaligned(); // unaligned load of lhs pack M_ITER (N is presumably the lane count of Pack)
+                    }});
+
+                    seq_macro::seq!(N_ITER in 0..$nr {{
+                        *self.rhs = splat(*packed_rhs.wrapping_offset(N_ITER * self.rhs_cs)); // splat rhs element N_ITER of this k-step
+                        let accum = self.accum.add(N_ITER * $mr_div_n);
+                        seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                            let accum = &mut *accum.add(M_ITER);
+                            *accum = mul_add( // presumably lhs * rhs + accum; confirm against mul_add's definition
+                                *self.lhs.add(M_ITER),
+                                *self.rhs,
+                                *accum,
+                                );
+                        }});
+                    }});
+
+                    let _ = next_lhs; // next_lhs is computed but not otherwise used
+                }
+
+                $(
+                    #[inline(always)]
+                    unsafe fn execute_neon(self, iter: usize) { // lane path, only generated when $nr_div_n and $n are supplied; expects rhs_cs == 1
+                        debug_assert_eq!(self.rhs_cs, 1);
+                        let packed_lhs = self.packed_lhs.wrapping_offset(iter as isize * self.lhs_cs);
+                        let packed_rhs = self.packed_rhs.wrapping_offset(iter as isize * self.rhs_rs);
+
+                        seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                            *self.lhs.add(M_ITER) = (packed_lhs.add(M_ITER * N) as *const Pack).read_unaligned();
+                        }});
+
+                        seq_macro::seq!(N_ITER0 in 0..$nr_div_n {{
+                            *self.rhs = (packed_rhs.wrapping_offset(N_ITER0 * $n) as *const Pack).read_unaligned(); // load one whole rhs pack starting at element N_ITER0 * $n
+
+                            seq_macro::seq!(N_ITER1 in 0..$n {{
+                                const N_ITER: usize = N_ITER0 * $n + N_ITER1; // accumulator column updated by this lane
+                                let accum = self.accum.add(N_ITER * $mr_div_n);
+                                seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                                    let accum = &mut *accum.add(M_ITER);
+                                    *accum = mul_add_lane::<N_ITER1>( // presumably lhs * (lane N_ITER1 of rhs) + accum; confirm against mul_add_lane
+                                        *self.lhs.add(M_ITER),
+                                        *self.rhs,
+                                        *accum,
+                                        );
+                                }});
+                            }});
+                        }});
+                    }
+                )?
+            }
+
+            let k_unroll = k / $unroll; // number of full groups of $unroll k-steps
+            let k_leftover = k % $unroll; // remaining k-steps, handled one at a time
+
+            loop { // runs at most once: used as a breakable block so the lane path can skip the general path
+                $(
+                let _ = $nr_div_n; // mentions $nr_div_n so this optional repetition is tied to that metavariable
+                if rhs_cs == 1 { // rhs elements are contiguous: take the lane path
+                    let mut depth = k_unroll;
+                    if depth != 0 { // guards the count-down loop below, which always runs its body at least once
+                        loop {
+                            let iter = KernelIter {
+                                packed_lhs,
+                                next_lhs,
+                                packed_rhs,
+                                lhs_cs,
+                                rhs_rs,
+                                rhs_cs,
+                                accum,
+                                lhs: lhs.as_mut_ptr() as _,
+                                rhs: &mut rhs as *mut _ as _,
+                            };
+
+                            seq_macro::seq!(UNROLL_ITER in 0..$unroll {{ // $unroll k-steps at offsets 0..$unroll from the same base pointers
+                                iter.execute_neon(UNROLL_ITER);
+                            }});
+
+                            packed_lhs = packed_lhs.wrapping_offset($unroll * lhs_cs); // move the base pointers past the group just processed
+                            packed_rhs = packed_rhs.wrapping_offset($unroll * rhs_rs);
+                            next_lhs = next_lhs.wrapping_offset($unroll * lhs_cs);
+
+                            depth -= 1;
+                            if depth == 0 {
+                                break;
+                            }
+                        }
+                    }
+                    depth = k_leftover;
+                    if depth != 0 {
+                        loop {
+                            KernelIter {
+                                packed_lhs,
+                                next_lhs,
+                                packed_rhs,
+                                lhs_cs,
+                                rhs_rs,
+                                rhs_cs,
+                                accum,
+                                lhs: lhs.as_mut_ptr() as _,
+                                rhs: &mut rhs as *mut _ as _,
+                            }
+                            .execute_neon(0);
+
+                            packed_lhs = packed_lhs.wrapping_offset(lhs_cs);
+                            packed_rhs = packed_rhs.wrapping_offset(rhs_rs);
+                            next_lhs = next_lhs.wrapping_offset(lhs_cs);
+
+                            depth -= 1;
+                            if depth == 0 {
+                                break;
+                            }
+                        }
+                    }
+                    break; // lane path finished: skip the general path below
+                }
+                )?
+
+                let mut depth = k_unroll; // general path: same structure as the lane path, using execute
+                if depth != 0 {
+                    loop {
+                        let iter = KernelIter {
+                            packed_lhs,
+                            next_lhs,
+                            packed_rhs,
+                            lhs_cs,
+                            rhs_rs,
+                            rhs_cs,
+                            accum,
+                            lhs: lhs.as_mut_ptr() as _,
+                            rhs: &mut rhs as *mut _ as _,
+                        };
+
+                        seq_macro::seq!(UNROLL_ITER in 0..$unroll {{
+                            iter.execute(UNROLL_ITER);
+                        }});
+
+                        packed_lhs = packed_lhs.wrapping_offset($unroll * lhs_cs);
+                        packed_rhs = packed_rhs.wrapping_offset($unroll * rhs_rs);
+                        next_lhs = next_lhs.wrapping_offset($unroll * lhs_cs);
+
+                        depth -= 1;
+                        if depth == 0 {
+                            break;
+                        }
+                    }
+                }
+                depth = k_leftover;
+                if depth != 0 {
+                    loop {
+                        KernelIter {
+                            packed_lhs,
+                            next_lhs,
+                            packed_rhs,
+                            lhs_cs,
+                            rhs_rs,
+                            rhs_cs,
+                            accum,
+                            lhs: lhs.as_mut_ptr() as _,
+                            rhs: &mut rhs as *mut _ as _,
+                        }
+                        .execute(0);
+
+                        packed_lhs = packed_lhs.wrapping_offset(lhs_cs);
+                        packed_rhs = packed_rhs.wrapping_offset(rhs_rs);
+                        next_lhs = next_lhs.wrapping_offset(lhs_cs);
+
+                        depth -= 1;
+                        if depth == 0 {
+                            break;
+                        }
+                    }
+                }
+                break;
+            }
+
+            if m == $mr_div_n * N && n == $nr && dst_rs == 1  { // full tile with unit row stride: update dst one pack at a time
+                let alpha = splat(alpha);
+                let beta = splat(beta);
+                if alpha_status == 2 {
+                    seq_macro::seq!(N_ITER in 0..$nr {{
+                        seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                            let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack; // pack M_ITER of dst column N_ITER
+                            dst.write_unaligned(add(
+                                    mul(alpha, dst.read_unaligned()),
+                                    mul(beta, *accum.offset(M_ITER + $mr_div_n * N_ITER)),
+                                    ));
+                        }});
+                    }});
+                } else if alpha_status == 1 {
+                    seq_macro::seq!(N_ITER in 0..$nr {{
+                        seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                            let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack;
+                            dst.write_unaligned(mul_add(
+                                    beta,
+                                    *accum.offset(M_ITER + $mr_div_n * N_ITER),
+                                    dst.read_unaligned(),
+                                    ));
+                        }});
+                    }});
+                } else {
+                    seq_macro::seq!(N_ITER in 0..$nr {{
+                        seq_macro::seq!(M_ITER in 0..$mr_div_n {{
+                            let dst = dst.offset(M_ITER * N as isize + N_ITER * dst_cs) as *mut Pack;
+                            dst.write_unaligned(mul(beta, *accum.offset(M_ITER + $mr_div_n * N_ITER))); // dst is overwritten without being read
+                        }});
+                    }});
+                }
+            } else { // partial tile or non-unit row stride: element-by-element update of the m x n sub-block
+                let src = accum_storage; // copy of the accumulator tile, read element-wise below
+                let src = src.as_ptr() as *const T;
+
+                if alpha_status == 2 {
+                    for j in 0..n {
+                        let dst_j = dst.offset(dst_cs * j as isize);
+                        let src_j = src.add(j * $mr_div_n * N); // column j of the tile starts $mr_div_n * N elements in
+
+                        for i in 0..m {
+                            let dst_ij = dst_j.offset(dst_rs * i as isize);
+                            let src_ij = src_j.add(i);
+
+                            *dst_ij = alpha * *dst_ij + beta * *src_ij;
+                        }
+                    }
+                } else if alpha_status == 1 {
+                    for j in 0..n {
+                        let dst_j = dst.offset(dst_cs * j as isize);
+                        let src_j = src.add(j * $mr_div_n * N);
+
+                        for i in 0..m {
+                            let dst_ij = dst_j.offset(dst_rs * i as isize);
+                            let src_ij = src_j.add(i);
+
+                            *dst_ij = *dst_ij + beta * *src_ij;
+                        }
+                    }
+                } else {
+                    for j in 0..n {
+                        let dst_j = dst.offset(dst_cs * j as isize);
+                        let src_j = src.add(j * $mr_div_n * N);
+
+                        for i in 0..m {
+                            let dst_ij = dst_j.offset(dst_rs * i as isize);
+                            let src_ij = src_j.add(i);
+
+                            *dst_ij = beta * *src_ij; // dst is overwritten without being read
+                        }
+                    }
+                }
+            }
+
+        }
+    };
+}
+
 #[macro_export]
 macro_rules! microkernel_cplx {
     ($([$target: tt])?, $unroll: tt, $name: ident, $mr_div_n: tt, $nr: tt) => {

diff --git a/gemm-common/src/simd.rs b/gemm-common/src/simd.rs
@@ -64,6 +64,24 @@ mod x86 {
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub use x86::*;
 
+#[cfg(target_arch = "aarch64")]
+mod aarch64 { // aarch64-only Simd implementation; compiled out on every other target
+    use super::*;
+
+    #[derive(Copy, Clone)]
+    pub struct Neon; // unit type naming the NEON code path
+
+    impl Simd for Neon {
+        #[inline]
+        #[target_feature(enable = "neon")]
+        unsafe fn vectorize(f: impl FnOnce()) { // calls f from a function compiled with the "neon" target feature enabled
+            f()
+        }
+    }
+}
+#[cfg(target_arch = "aarch64")]
+pub use aarch64::*;
+
 #[cfg(target_arch = "wasm32")]
 mod wasm32 {
     use super::*;

diff --git a/gemm-f16/Cargo.toml b/gemm-f16/Cargo.toml
@@ -21,7 +21,8 @@ paste = { workspace = true }
 
 gemm-common = { version = "0.15", path = "../gemm-common" }
 gemm-f32 = { version = "0.15", path = "../gemm-f32" }
-half = { version = "2.2", features = ["num-traits"] }
+# Previously: half = { version = "2.2", features = ["num-traits"] } (crates.io release, replaced by the git fork below)
+half = { git = "https://github.com/Narsil/half-rs", branch="more_intrinsics", features = ["num-traits"] }
 
 [features]
 default = ["std"]