From 9afb7ec643654d851f1991ddd12e0b590b1576ca Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 04:18:43 -0400
Subject: [PATCH 1/7] Remove commented-out block of code

---
 base/runtime/internal.odin | 59 --------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 1a97ade096f..39ed5ce1659 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -236,65 +236,6 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 		}
 	}
 	return true
-
-/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				[...]
-			}
-		}
-		if word_length := [...] >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				[...]
-			}
-		}
-	}
-*/
 }
 
 memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
 	switch {

From c53419184d18b3bdf1ab33622b0991afea5836d2 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 15:59:38 -0400
Subject: [PATCH 2/7] Use `tick_*` procs in `core:bytes` benchmark

---
 tests/benchmark/bytes/benchmark_bytes.odin | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index d303e81dd69..0085bbcf82d 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -53,9 +53,9 @@ run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup:
 	}
 
 	for _ in 0..<runs {
-		start := time.now()
+		start := time.tick_now()
 		[...]
-		timing += time.since(start)
+		timing += time.tick_since(start)
 	}

From [...] Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 16:00:04 -0400
Subject: [PATCH 3/7] Vectorize `base:runtime.memory_*`

---
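Note: `SIMD_SCAN_WIDTH` is `8 * size_of(uintptr)`, i.e. 32 lanes on 32-bit
targets and 64 lanes on 64-bit targets, so one vector step scans 32 or 64
bytes at a time. The `simd_scanner_indices` table exists so that
`memory_compare` can recover *which* lane differed first: lanes that differ
select their own index, lanes that match select a 0xFF sentinel, and a
minimum-reduction then yields the earliest differing index. A minimal
standalone sketch of that technique, fixed to 16 lanes for brevity (the names
`WIDTH`, `indices`, and `first_mismatch` are illustrative, not part of this
patch):

    package mismatch_sketch

    import "base:intrinsics"

    WIDTH :: 16

    indices := #simd[WIDTH]u8 {
    	0, 1, 2,  3,  4,  5,  6,  7,
    	8, 9, 10, 11, 12, 13, 14, 15,
    }

    // Returns the index of the first differing lane, or -1 if all lanes match.
    first_mismatch :: proc "contextless" (a, b: #simd[WIDTH]u8) -> int {
    	// Differing lanes become 0xFF, matching lanes become 0x00.
    	ne_mask := intrinsics.simd_lanes_ne(a, b)
    	if intrinsics.simd_reduce_or(ne_mask) == 0 {
    		return -1
    	}
    	// Differing lanes keep their own index; matching lanes take the 0xFF
    	// sentinel. The minimum is therefore the earliest differing index.
    	sentinel: #simd[WIDTH]u8 = u8(0xFF)
    	selected := intrinsics.simd_select(ne_mask, indices, sentinel)
    	return int(intrinsics.simd_reduce_min(selected))
    }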
 base/runtime/internal.odin | 176 ++++++++++++++++++++++++-------------
 1 file changed, 117 insertions(+), 59 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 39ed5ce1659..78d38e9517e 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -17,6 +17,32 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+@(private)
+SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
+
+when SIMD_SCAN_WIDTH == 32 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+	}
+} else when SIMD_SCAN_WIDTH == 64 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+		32, 33, 34, 35, 36, 37, 38, 39,
+		40, 41, 42, 43, 44, 45, 46, 47,
+		48, 49, 50, 51, 52, 53, 54, 55,
+		56, 57, 58, 59, 60, 61, 62, 63,
+	}
+} else {
+	#panic("Invalid SIMD_SCAN_WIDTH. Must be 32 or 64.")
+}
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -227,88 +253,120 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
+	a, b := cast([^]u8)x, cast([^]u8)y
+	i := 0
+
+	// NOTE: Because we cannot guarantee simultaneous alignment of two separate
+	// pointers with a single iterator, we only align by length and not by the
+	// actual data layout.
+	//
+	// Therefore, in the vector loop, all loads must be unaligned.
+	//
+	// This at least lets us iterate freely without regard for a tail portion.
+	alignment_start := n % SIMD_SCAN_WIDTH
 
-	for i := uint(0); i < length; i += 1 {
+	// Iterate as a scalar until the remaining length is aligned.
+	for /**/; i < alignment_start; i += 1 {
 		if a[i] != b[i] {
 			return false
 		}
 	}
+
+	// Iterate as a vector over the remaining data.
+	for /**/; i < n; i += SIMD_SCAN_WIDTH {
+		load_a := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))
+		load_b := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&b[i]))
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match != 0 {
+			return false
+		}
+	}
+
 	return true
 }
 
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b:   return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
-				}
-			}
-		}
+	case x == y:   return 0
+	case x == nil: return -1
+	case y == nil: return +1
 	}
+	a, b := cast([^]u8)x, cast([^]u8)y
+	i := 0
+
+	// NOTE: Because we cannot guarantee simultaneous alignment of two separate
+	// pointers with a single iterator, we only align by length and not by the
+	// actual data layout.
+	//
+	// Therefore, in the vector loop, all loads must be unaligned.
+	//
+	// This at least lets us iterate freely without regard for a tail portion.
+	alignment_start := n % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the remaining length is aligned.
+	for /**/; i < alignment_start; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if a[i] < b[i] else +1
+		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	// Iterate as a vector over the remaining data.
+	for /**/; i < n; i += SIMD_SCAN_WIDTH {
+		load_a := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))
+		load_b := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&b[i]))
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match != 0 {
+			sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := cast(int)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
 
 	return 0
 }
 
-memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
+memory_compare_zero :: proc "contextless" (x: rawptr, n: int) -> int #no_bounds_check {
+	a := cast([^]u8)x
+	i := 0
 
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
+	// NOTE: Because we're comparing against zero, we can never return -1.
 
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
-				}
+	// Guard against small data.
+	if n < SIMD_SCAN_WIDTH {
+		for /**/; i < n; i += 1 {
+			if a[i] != 0 {
+				return 1
 			}
 		}
+		return 0
+	}
+
+	alignment_start := (SIMD_SCAN_WIDTH - cast(int)cast(uintptr)a % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned.
+	for /**/; i < alignment_start; i += 1 {
+		if a[i] != 0 {
+			return 1
+		}
+	}
+
+	// Iterate as a vector over the memory-aligned portion.
+	tail := n - (n - alignment_start) % SIMD_SCAN_WIDTH
+
+	for /**/; i < tail; i += SIMD_SCAN_WIDTH {
+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))^
+		match := intrinsics.simd_reduce_or(load)
+		if match != 0 {
+			return 1
+		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < n; i += 1 {
+		if a[i] != 0 {
+			return 1
 		}
 	}
 
 	return 0
 }

From d230a19229731d4d43e9c858cb57e34addb8b02f Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:29:48 -0400
Subject: [PATCH 4/7] Add tests for `base:runtime.memory_*`

---
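Note: the contract pinned down by these tests is the same one the scalar
implementations had: `memory_equal` reports whether all `n` bytes match,
`memory_compare` returns -1/0/+1 decided by the first differing byte, and
`memory_compare_zero` returns 0 for all-zero memory and +1 otherwise (never
-1, since no unsigned byte orders below zero). The buffers are sized
`2 * SIMD_SCAN_WIDTH` and mismatches are planted at varying offsets, so both
the scalar head and the vector body of each procedure get exercised. A
minimal sketch of the contract as a test (illustrative only, not one of the
tests added below):

    package contract_sketch

    import "base:runtime"
    import "core:testing"

    @(test)
    memory_contract_sketch :: proc(t: ^testing.T) {
    	a := [8]u8{1, 2, 3, 4, 5, 6, 7, 8}
    	b := [8]u8{1, 2, 3, 4, 5, 6, 7, 9}

    	testing.expect(t, runtime.memory_equal(&a, &a, 8))
    	testing.expect(t, !runtime.memory_equal(&a, &b, 8))

    	// The first differing byte (8 < 9) decides the ordering.
    	testing.expect_value(t, runtime.memory_compare(&a, &b, 8), -1)
    	testing.expect_value(t, runtime.memory_compare(&b, &a, 8), +1)

    	zeroes: [8]u8
    	testing.expect_value(t, runtime.memory_compare_zero(&zeroes, 8), 0)
    	testing.expect_value(t, runtime.memory_compare_zero(&b, 8), 1)
    }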
 tests/core/runtime/test_core_runtime.odin | 90 ++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/tests/core/runtime/test_core_runtime.odin b/tests/core/runtime/test_core_runtime.odin
index 008146dcf5e..33fb7345b33 100644
--- a/tests/core/runtime/test_core_runtime.odin
+++ b/tests/core/runtime/test_core_runtime.odin
@@ -3,6 +3,7 @@ package test_core_runtime
 import "base:intrinsics"
 import "core:mem"
 import "base:runtime"
+import "core:slice"
 import "core:testing"
 
 // Tests that having space for the allocation, but not for the allocation and alignment
@@ -36,4 +37,91 @@ test_temp_allocator_returns_correct_size :: proc(t: ^testing.T) {
 	bytes, err := mem.alloc_bytes(10, 16)
 	testing.expect(t, err == nil)
 	testing.expect(t, len(bytes) == 10)
-}
\ No newline at end of file
+}
+
+@(private)
+SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+	data: [2 * SIMD_SCAN_WIDTH]u8
+	cmp:  [2 * SIMD_SCAN_WIDTH]u8
+
+	slice.fill(data[:], 0xAA)
+	slice.fill(cmp[:], 0xAA)
+
+	INDEX_MAX :: SIMD_SCAN_WIDTH - 1
+
+	for offset in 0..<INDEX_MAX {
+		[...]

From [...] Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:38:24 -0400
Subject: [PATCH 5/7] Remove unneeded alloc in `bytes` test

---
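Note: `2 * SIMD_SCAN_WIDTH` is a compile-time constant, so the test buffer
can live on the stack instead of going through the allocator; only the
`slice.fill` call changes, since it needs `data[:]` to slice the fixed-size
array into a `[]u8`. A sketch of the idiom (assuming `SIMD_SCAN_WIDTH` and
`core:slice` are in scope, as they are in this test file):

    data: [2 * SIMD_SCAN_WIDTH]u8 // zero-initialized, lives on the stack
    slice.fill(data[:], '-')      // data[:] slices the array into a []u8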
 tests/core/bytes/test_core_bytes.odin | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin
index fb3c460aa8c..640e6b97284 100644
--- a/tests/core/bytes/test_core_bytes.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -9,9 +9,8 @@ import "core:testing"
 @test
 test_index_byte_sanity :: proc(t: ^testing.T) {
 	// We must be able to find the byte at the correct index.
-	data := make([]u8, 2 * SIMD_SCAN_WIDTH)
-	defer delete(data)
-	slice.fill(data, '-')
+	data: [2 * SIMD_SCAN_WIDTH]u8
+	slice.fill(data[:], '-')
 
 	INDEX_MAX :: SIMD_SCAN_WIDTH - 1
 

From 90820ea53de9d76521512c0910277f5aaa3285db Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:38:46 -0400
Subject: [PATCH 6/7] Add sizes adjacent to 64-bit `SIMD_SCAN_WIDTH` in `bytes` benchmark

---
 tests/benchmark/bytes/benchmark_bytes.odin | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index 0085bbcf82d..e8329664ebb 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -30,6 +30,7 @@ plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
 sizes := [?]int {
 	15, 16, 17,
 	31, 32, 33,
+	63, 64, 65,
 	256,
 	512,
 	1024,

From de7dd9f35e3ddd8212d665e3b5b931130f4c5ddd Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:47:13 -0400
Subject: [PATCH 7/7] Add benchmarks for `base:runtime.memory_*`

---
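Note: each trial runner in this file plants a nonzero byte at the very end of
its buffer (and, for the two-buffer runners, a differing byte), so the
procedures under test must scan the entire length before finding anything,
which is their worst case. Results are summed into an `accumulator` that is
ultimately handed to the logger, so the optimizer cannot discard the calls. A
sketch of the runner shape, assuming the same `time.tick_*` timing the
`core:bytes` benchmark switched to in PATCH 2 (the name `run_trial` and the
`log.debug` call are illustrative):

    package trial_sketch

    import "core:log"
    import "core:time"

    run_trial :: proc(p: proc "contextless" (rawptr, int) -> int, data: []u8, warmup, runs: int) -> (timing: time.Duration) {
    	// Accumulate results so the calls cannot be optimized away.
    	accumulator: int

    	for _ in 0..<warmup {
    		accumulator += p(raw_data(data), len(data))
    	}

    	start := time.tick_now()
    	for _ in 0..<runs {
    		accumulator += p(raw_data(data), len(data))
    	}
    	timing = time.tick_since(start)

    	log.debug(accumulator) // Keep the result observable.
    	return
    }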
 tests/benchmark/runtime/runtime.odin | 332 +++++++++++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 tests/benchmark/runtime/runtime.odin

diff --git a/tests/benchmark/runtime/runtime.odin b/tests/benchmark/runtime/runtime.odin
new file mode 100644
index 00000000000..fe7c25d9541
--- /dev/null
+++ b/tests/benchmark/runtime/runtime.odin
@@ -0,0 +1,332 @@
+package benchmark_runtime
+
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:time"
+
+
+// These are the normal, unoptimized algorithms.
+
+plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
+	switch {
+	case n == 0: return true
+	case x == y: return true
+	}
+	a, b := ([^]byte)(x), ([^]byte)(y)
+	length := uint(n)
+
+	for i := uint(0); i < length; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+	switch {
+	case a == b:   return 0
+	case a == nil: return -1
+	case b == nil: return +1
+	}
+
+	x := uintptr(a)
+	y := uintptr(b)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
+		if va ~ vb != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				b := (^byte)(y+pos)^
+				if a ~ b != 0 {
+					return -1 if (int(a) - int(b)) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		b := (^byte)(y+offset)^
+		if a ~ b != 0 {
+			return -1 if (int(a) - int(b)) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
+	x := uintptr(a)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		if va ~ 0 != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				if a ~ 0 != 0 {
+					return -1 if int(a) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		if a ~ 0 != 0 {
+			return -1 if int(a) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+
+sizes := [?]int {
+	15, 16, 17,
+	31, 32, 33,
+	63, 64, 65,
+	256,
+	512,
+	1024,
+	1024 * 1024,
+	1024 * 1024 * 1024,
+}
+
+run_trial_size :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data := make([]u8, size)
+	defer delete(data)
+	data[size - 1] = 1
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
+
+[...] :: proc(p: proc "contextless" (rawptr, rawptr, int) -> bool, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data_a := make([]u8, size)
+	data_b := make([]u8, size)
+	defer {
+		delete(data_a)
+		delete(data_b)
+	}
+	data_a[size - 1] = 1
+	data_b[size - 1] = 2
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
+
+[...] :: proc(p: proc "contextless" (rawptr, rawptr, int) -> int, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data_a := make([]u8, size)
+	data_b := make([]u8, size)
+	defer {
+		delete(data_a)
+		delete(data_b)
+	}
+	data_a[size - 1] = 1
+	data_b[size - 1] = 2
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
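Note: a driver proc along the following lines would pit each `plain_*`
procedure against its vectorized `base:runtime` counterpart across `sizes`.
This is an illustrative sketch only; `run_trial_compare`, `WARMUP`, and
`RUNS` are hypothetical names standing in for this file's actual runner and
constants:

    // Hypothetical driver sketch; `run_trial_compare` stands in for the
    // two-buffer runner above that takes a (rawptr, rawptr, int) -> int proc.
    @(test)
    benchmark_memory_compare :: proc(t: ^testing.T) {
    	WARMUP :: 100
    	RUNS   :: 1000
    	for size in sizes {
    		plain := run_trial_compare(plain_memory_compare, size, WARMUP, RUNS)
    		simd  := run_trial_compare(runtime.memory_compare, size, WARMUP, RUNS)
    		log.infof("memory_compare size %v: plain %v, simd %v", size, plain, simd)
    	}
    }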