From 9afb7ec643654d851f1991ddd12e0b590b1576ca Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 04:18:43 -0400
Subject: [PATCH 1/7] Remove commented-out block of code

---
 base/runtime/internal.odin | 59 --------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 1a97ade096f..39ed5ce1659 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -236,65 +236,6 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 		}
 	}
 	return true
-
-/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				[...]
-			}
-		}
-		if word_length := [...] >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				[...]
-			}
-		}
-	}
-*/
 }
 
 memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
 	switch {

From c53419184d18b3bdf1ab33622b0991afea5836d2 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 15:59:38 -0400
Subject: [PATCH 2/7] Use `tick_*` procs in `core:bytes` benchmark

---
 tests/benchmark/bytes/benchmark_bytes.odin | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index d303e81dd69..0085bbcf82d 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -53,9 +53,9 @@ run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup:
 	}
 
 	for _ in 0..<runs {
-		start := time.now()
+		start := time.tick_now()
 		[...]
-		timing += time.since(start)
+		timing += time.tick_since(start)
 	}

From [...] Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 16:00:04 -0400
Subject: [PATCH 3/7] Vectorize `base:runtime.memory_*`

---
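Note: `SIMD_SCAN_WIDTH` is `8 * size_of(uintptr)`, i.e. 32 lanes on 32-bit
targets and 64 lanes on 64-bit targets, so one vector step scans 32 or 64
bytes at a time. The `simd_scanner_indices` table exists so that
`memory_compare` can recover *which* lane differed first: lanes that differ
select their own index, lanes that match select a 0xFF sentinel, and a
minimum-reduction then yields the earliest differing index. A minimal
standalone sketch of that technique, fixed to 16 lanes for brevity (the names
`WIDTH`, `indices`, and `first_mismatch` are illustrative, not part of this
patch):

    package mismatch_sketch

    import "base:intrinsics"

    WIDTH :: 16

    indices := #simd[WIDTH]u8 {
    	0, 1, 2,  3,  4,  5,  6,  7,
    	8, 9, 10, 11, 12, 13, 14, 15,
    }

    // Returns the index of the first differing lane, or -1 if all lanes match.
    first_mismatch :: proc "contextless" (a, b: #simd[WIDTH]u8) -> int {
    	// Differing lanes become 0xFF, matching lanes become 0x00.
    	ne_mask := intrinsics.simd_lanes_ne(a, b)
    	if intrinsics.simd_reduce_or(ne_mask) == 0 {
    		return -1
    	}
    	// Differing lanes keep their own index; matching lanes take the 0xFF
    	// sentinel. The minimum is therefore the earliest differing index.
    	sentinel: #simd[WIDTH]u8 = u8(0xFF)
    	selected := intrinsics.simd_select(ne_mask, indices, sentinel)
    	return int(intrinsics.simd_reduce_min(selected))
    }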
 base/runtime/internal.odin | 176 ++++++++++++++++++++++++-------------
 1 file changed, 117 insertions(+), 59 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 39ed5ce1659..78d38e9517e 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -17,6 +17,32 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+@(private)
+SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
+
+when SIMD_SCAN_WIDTH == 32 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+	}
+} else when SIMD_SCAN_WIDTH == 64 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+		32, 33, 34, 35, 36, 37, 38, 39,
+		40, 41, 42, 43, 44, 45, 46, 47,
+		48, 49, 50, 51, 52, 53, 54, 55,
+		56, 57, 58, 59, 60, 61, 62, 63,
+	}
+} else {
+	#panic("Invalid SIMD_SCAN_WIDTH. Must be 32 or 64.")
+}
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -227,88 +253,120 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
+	a, b := cast([^]u8)x, cast([^]u8)y
+	i := 0
+
+	// NOTE: Because we cannot guarantee simultaneous alignment of two separate
+	// pointers with a single iterator, we only align by length and not by the
+	// actual data layout.
+	//
+	// Therefore, in the vector loop, all loads must be unaligned.
+	//
+	// This at least lets us iterate freely without regard for a tail portion.
+	alignment_start := n % SIMD_SCAN_WIDTH
 
-	for i := uint(0); i < length; i += 1 {
+	// Iterate as a scalar until the remaining length is aligned.
+	for /**/; i < alignment_start; i += 1 {
 		if a[i] != b[i] {
 			return false
 		}
 	}
+
+	// Iterate as a vector over the remaining data.
+	for /**/; i < n; i += SIMD_SCAN_WIDTH {
+		load_a := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))
+		load_b := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&b[i]))
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match != 0 {
+			return false
+		}
+	}
+
 	return true
 }
 
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b:   return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
-				}
-			}
-		}
+	case x == y:   return 0
+	case x == nil: return -1
+	case y == nil: return +1
 	}
+	a, b := cast([^]u8)x, cast([^]u8)y
+	i := 0
+
+	// NOTE: Because we cannot guarantee simultaneous alignment of two separate
+	// pointers with a single iterator, we only align by length and not by the
+	// actual data layout.
+	//
+	// Therefore, in the vector loop, all loads must be unaligned.
+	//
+	// This at least lets us iterate freely without regard for a tail portion.
+	alignment_start := n % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the remaining length is aligned.
+	for /**/; i < alignment_start; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if a[i] < b[i] else +1
+		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	// Iterate as a vector over the remaining data.
+	for /**/; i < n; i += SIMD_SCAN_WIDTH {
+		load_a := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))
+		load_b := intrinsics.unaligned_load(cast(^#simd[SIMD_SCAN_WIDTH]u8)(&b[i]))
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match != 0 {
+			sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := cast(int)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
 
 	return 0
 }
 
-memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
+memory_compare_zero :: proc "contextless" (x: rawptr, n: int) -> int #no_bounds_check {
+	a := cast([^]u8)x
+	i := 0
 
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
+	// NOTE: Because we're comparing against zero, we can never return -1.
 
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
-				}
+	// Guard against small data.
+	if n < SIMD_SCAN_WIDTH {
+		for /**/; i < n; i += 1 {
+			if a[i] != 0 {
+				return 1
 			}
 		}
+		return 0
+	}
+
+	alignment_start := (SIMD_SCAN_WIDTH - cast(int)cast(uintptr)a % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned.
+	for /**/; i < alignment_start; i += 1 {
+		if a[i] != 0 {
+			return 1
+		}
+	}
+
+	// Iterate as a vector over the memory-aligned portion.
+	tail := n - (n - alignment_start) % SIMD_SCAN_WIDTH
+
+	for /**/; i < tail; i += SIMD_SCAN_WIDTH {
+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&a[i]))^
+		match := intrinsics.simd_reduce_or(load)
+		if match != 0 {
+			return 1
+		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < n; i += 1 {
+		if a[i] != 0 {
+			return 1
 		}
 	}
 
 	return 0
 }

From d230a19229731d4d43e9c858cb57e34addb8b02f Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:29:48 -0400
Subject: [PATCH 4/7] Add tests for `base:runtime.memory_*`

---
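Note: the contract pinned down by these tests is the same one the scalar
implementations had: `memory_equal` reports whether all `n` bytes match,
`memory_compare` returns -1/0/+1 decided by the first differing byte, and
`memory_compare_zero` returns 0 for all-zero memory and +1 otherwise (never
-1, since no unsigned byte orders below zero). The buffers are sized
`2 * SIMD_SCAN_WIDTH` and mismatches are planted at varying offsets, so both
the scalar head and the vector body of each procedure get exercised. A
minimal sketch of the contract as a test (illustrative only, not one of the
tests added below):

    package contract_sketch

    import "base:runtime"
    import "core:testing"

    @(test)
    memory_contract_sketch :: proc(t: ^testing.T) {
    	a := [8]u8{1, 2, 3, 4, 5, 6, 7, 8}
    	b := [8]u8{1, 2, 3, 4, 5, 6, 7, 9}

    	testing.expect(t, runtime.memory_equal(&a, &a, 8))
    	testing.expect(t, !runtime.memory_equal(&a, &b, 8))

    	// The first differing byte (8 < 9) decides the ordering.
    	testing.expect_value(t, runtime.memory_compare(&a, &b, 8), -1)
    	testing.expect_value(t, runtime.memory_compare(&b, &a, 8), +1)

    	zeroes: [8]u8
    	testing.expect_value(t, runtime.memory_compare_zero(&zeroes, 8), 0)
    	testing.expect_value(t, runtime.memory_compare_zero(&b, 8), 1)
    }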
 tests/core/runtime/test_core_runtime.odin | 90 ++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/tests/core/runtime/test_core_runtime.odin b/tests/core/runtime/test_core_runtime.odin
index 008146dcf5e..33fb7345b33 100644
--- a/tests/core/runtime/test_core_runtime.odin
+++ b/tests/core/runtime/test_core_runtime.odin
@@ -3,6 +3,7 @@ package test_core_runtime
 import "base:intrinsics"
 import "core:mem"
 import "base:runtime"
+import "core:slice"
 import "core:testing"
 
 // Tests that having space for the allocation, but not for the allocation and alignment
@@ -36,4 +37,91 @@ test_temp_allocator_returns_correct_size :: proc(t: ^testing.T) {
 	bytes, err := mem.alloc_bytes(10, 16)
 	testing.expect(t, err == nil)
 	testing.expect(t, len(bytes) == 10)
-}
\ No newline at end of file
+}
+
+@(private)
+SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+	data: [2 * SIMD_SCAN_WIDTH]u8
+	cmp:  [2 * SIMD_SCAN_WIDTH]u8
+
+	slice.fill(data[:], 0xAA)
+	slice.fill(cmp[:], 0xAA)
+
+	INDEX_MAX :: SIMD_SCAN_WIDTH - 1
+
+	for offset in 0..<INDEX_MAX {
+		[...]

From [...] Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:38:24 -0400
Subject: [PATCH 5/7] Remove unneeded alloc in `bytes` test

---
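Note: `2 * SIMD_SCAN_WIDTH` is a compile-time constant, so the test buffer
can live on the stack instead of going through the allocator; only the
`slice.fill` call changes, since it needs `data[:]` to slice the fixed-size
array into a `[]u8`. A sketch of the idiom (assuming `SIMD_SCAN_WIDTH` and
`core:slice` are in scope, as they are in this test file):

    data: [2 * SIMD_SCAN_WIDTH]u8 // zero-initialized, lives on the stack
    slice.fill(data[:], '-')      // data[:] slices the array into a []u8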
 tests/core/bytes/test_core_bytes.odin | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin
index fb3c460aa8c..640e6b97284 100644
--- a/tests/core/bytes/test_core_bytes.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -9,9 +9,8 @@ import "core:testing"
 @test
 test_index_byte_sanity :: proc(t: ^testing.T) {
 	// We must be able to find the byte at the correct index.
-	data := make([]u8, 2 * SIMD_SCAN_WIDTH)
-	defer delete(data)
-	slice.fill(data, '-')
+	data: [2 * SIMD_SCAN_WIDTH]u8
+	slice.fill(data[:], '-')
 
 	INDEX_MAX :: SIMD_SCAN_WIDTH - 1
 

From 90820ea53de9d76521512c0910277f5aaa3285db Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:38:46 -0400
Subject: [PATCH 6/7] Add sizes adjacent to 64-bit `SIMD_SCAN_WIDTH` in `bytes` benchmark

---
 tests/benchmark/bytes/benchmark_bytes.odin | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index 0085bbcf82d..e8329664ebb 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -30,6 +30,7 @@ plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
 sizes := [?]int {
 	15, 16, 17,
 	31, 32, 33,
+	63, 64, 65,
 	256,
 	512,
 	1024,

From de7dd9f35e3ddd8212d665e3b5b931130f4c5ddd Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 11 Aug 2024 18:47:13 -0400
Subject: [PATCH 7/7] Add benchmarks for `base:runtime.memory_*`

---
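Note: each trial runner in this file plants a nonzero byte at the very end of
its buffer (and, for the two-buffer runners, a differing byte), so the
procedures under test must scan the entire length before finding anything,
which is their worst case. Results are summed into an `accumulator` that is
ultimately handed to the logger, so the optimizer cannot discard the calls. A
sketch of the runner shape, assuming the same `time.tick_*` timing the
`core:bytes` benchmark switched to in PATCH 2 (the name `run_trial` and the
`log.debug` call are illustrative):

    package trial_sketch

    import "core:log"
    import "core:time"

    run_trial :: proc(p: proc "contextless" (rawptr, int) -> int, data: []u8, warmup, runs: int) -> (timing: time.Duration) {
    	// Accumulate results so the calls cannot be optimized away.
    	accumulator: int

    	for _ in 0..<warmup {
    		accumulator += p(raw_data(data), len(data))
    	}

    	start := time.tick_now()
    	for _ in 0..<runs {
    		accumulator += p(raw_data(data), len(data))
    	}
    	timing = time.tick_since(start)

    	log.debug(accumulator) // Keep the result observable.
    	return
    }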
 tests/benchmark/runtime/runtime.odin | 332 +++++++++++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 tests/benchmark/runtime/runtime.odin

diff --git a/tests/benchmark/runtime/runtime.odin b/tests/benchmark/runtime/runtime.odin
new file mode 100644
index 00000000000..fe7c25d9541
--- /dev/null
+++ b/tests/benchmark/runtime/runtime.odin
@@ -0,0 +1,332 @@
+package benchmark_runtime
+
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:time"
+
+
+// These are the normal, unoptimized algorithms.
+
+plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
+	switch {
+	case n == 0: return true
+	case x == y: return true
+	}
+	a, b := ([^]byte)(x), ([^]byte)(y)
+	length := uint(n)
+
+	for i := uint(0); i < length; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+	switch {
+	case a == b:   return 0
+	case a == nil: return -1
+	case b == nil: return +1
+	}
+
+	x := uintptr(a)
+	y := uintptr(b)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
+		if va ~ vb != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				b := (^byte)(y+pos)^
+				if a ~ b != 0 {
+					return -1 if (int(a) - int(b)) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		b := (^byte)(y+offset)^
+		if a ~ b != 0 {
+			return -1 if (int(a) - int(b)) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
+	x := uintptr(a)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		if va ~ 0 != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				if a ~ 0 != 0 {
+					return -1 if int(a) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		if a ~ 0 != 0 {
+			return -1 if int(a) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+
+sizes := [?]int {
+	15, 16, 17,
+	31, 32, 33,
+	63, 64, 65,
+	256,
+	512,
+	1024,
+	1024 * 1024,
+	1024 * 1024 * 1024,
+}
+
+run_trial_size :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data := make([]u8, size)
+	defer delete(data)
+	data[size - 1] = 1
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
+
+[...] :: proc(p: proc "contextless" (rawptr, rawptr, int) -> bool, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data_a := make([]u8, size)
+	data_b := make([]u8, size)
+	defer {
+		delete(data_a)
+		delete(data_b)
+	}
+	data_a[size - 1] = 1
+	data_b[size - 1] = 2
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
+
+[...] :: proc(p: proc "contextless" (rawptr, rawptr, int) -> int, size: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data_a := make([]u8, size)
+	data_b := make([]u8, size)
+	defer {
+		delete(data_a)
+		delete(data_b)
+	}
+	data_a[size - 1] = 1
+	data_b[size - 1] = 2
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		[...]
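Note: a driver proc along the following lines would pit each `plain_*`
procedure against its vectorized `base:runtime` counterpart across `sizes`.
This is an illustrative sketch only; `run_trial_compare`, `WARMUP`, and
`RUNS` are hypothetical names standing in for this file's actual runner and
constants:

    // Hypothetical driver sketch; `run_trial_compare` stands in for the
    // two-buffer runner above that takes a (rawptr, rawptr, int) -> int proc.
    @(test)
    benchmark_memory_compare :: proc(t: ^testing.T) {
    	WARMUP :: 100
    	RUNS   :: 1000
    	for size in sizes {
    		plain := run_trial_compare(plain_memory_compare, size, WARMUP, RUNS)
    		simd  := run_trial_compare(runtime.memory_compare, size, WARMUP, RUNS)
    		log.infof("memory_compare size %v: plain %v, simd %v", size, plain, simd)
    	}
    }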