From 5e1cc54d1b3c82bf03460678d504a6a2122a5dca Mon Sep 17 00:00:00 2001 From: Bandhav Veluri Date: Fri, 31 Jul 2020 11:54:41 -0700 Subject: [PATCH 1/5] Kernel for estimating dram latency --- .../dram_latency/Makefile | 38 +++++++++++ .../dram_latency/kernel_dram_latency.cpp | 66 +++++++++++++++++++ .../bsg_cuda_lite_runtime/dram_latency/main.c | 10 +++ 3 files changed, 114 insertions(+) create mode 100644 software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile create mode 100644 software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp create mode 100644 software/spmd/bsg_cuda_lite_runtime/dram_latency/main.c diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile b/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile new file mode 100644 index 000000000..a04013cbd --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile @@ -0,0 +1,38 @@ +######################################################### +# Network Configuration +# If not configured, Will use default Values + bsg_global_X ?= $(bsg_tiles_X) + bsg_global_Y ?= $(bsg_tiles_Y)+1 + +######################################################### +#Tile group configuration +# If not configured, Will use default Values + bsg_tiles_org_X ?= 0 + bsg_tiles_org_Y ?= 1 + +# If not configured, Will use default Values + bsg_tiles_X ?= 2 + bsg_tiles_Y ?= 2 + + +all: main.run + + +KERNEL_NAME ?=kernel_dram_latency + +OBJECT_FILES=main.o kernel_dram_latency.o + +RISCV_GXX_EXTRA_OPTS = -DVCACHE_SET=$(BSG_MACHINE_VCACHE_SET) \ + -DVCACHE_WAY=$(BSG_MACHINE_VCACHE_WAY) \ + -DVCACHE_BLOCK_SIZE_WORDS=$(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS) + +include ../../Makefile.include + + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) ../../common/crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../../mk/Makefile.tail_rules diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp new file mode 100644 index 000000000..111f9c39a --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp @@ -0,0 +1,66 @@ +// Kernel to estimate DRAM latency + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" +#include +#include + +const size_t VCACHE_NUM_BLOCKS = VCACHE_SET * VCACHE_WAY; +const size_t VCACHE_SIZE_WORDS = VCACHE_NUM_BLOCKS * VCACHE_BLOCK_SIZE_WORDS; + +const size_t NUM_BANKS = 2 * bsg_tiles_Y; + +const uint32_t DRAM_START_ADDR = 0x80000000; + +// Returns the eva we should write to given the index in +// the Vcache. +// +// Inverse of bsg_manycore/v/vanilla_bean/hash_function.v +// Based on bsg_manycore/v/vanilla_bean/hash_function_reverse.v +uint32_t vcache_inverse_hash_function(size_t block_index, + size_t bank) { + return 0x80000000; +} + +// Issues a load to given vcache block index and bank +inline void load_vcache_index(size_t i, size_t bank) { + uint32_t eva = vcache_inverse_hash_function(i, bank); + + int dummy; + asm volatile ( + "lw %0, 0(%1)" + : "=r" (dummy) + : "r" (eva)); +} + +// Flushes the vcache associated with a given bank +void flush_vcache(size_t bank) { + // Distribute vcache block indices among all tiles + size_t len_per_tile = VCACHE_SIZE_WORDS / (bsg_tiles_X * bsg_tiles_Y) + 1; + size_t start = __bsg_id * len_per_tile; + size_t end = start + len_per_tile; + end = (end > VCACHE_SIZE_WORDS) ? 
VCACHE_SIZE_WORDS : end; + + // Issue load to each block index + for(size_t i = start; i < end; ++i) + load_vcache_index(i, bank); +} + +extern "C" __attribute__ ((noinline)) +int kernel_dram_latency(int dummy) { + // Flush vcache associated with bank 0 + flush_vcache(0); + + // Opens a new page assuming vcache size would be + // a page boundary. + load_vcache_index(VCACHE_NUM_BLOCKS, 0); + + bsg_cuda_print_stat_kernel_start(); + size_t offset = VCACHE_NUM_BLOCKS + 1; + // Issue loads to 64 blocks in the opened page + for(size_t i = offset; i < offset + 64; ++i) + load_vcache_index(i, 0); + bsg_cuda_print_stat_kernel_end(); + + return 0; +} diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/main.c b/software/spmd/bsg_cuda_lite_runtime/dram_latency/main.c new file mode 100644 index 000000000..264aed765 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/main.c @@ -0,0 +1,10 @@ +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" +#include "bsg_cuda_lite_runtime.h" + + +int main() +{ + __wait_until_valid_func(); +} + From 312080f6821c9d1a8ab010b1ad4667e87cd7ff9d Mon Sep 17 00:00:00 2001 From: Bandhav Veluri Date: Fri, 31 Jul 2020 12:52:59 -0700 Subject: [PATCH 2/5] Added vcache inverse hash function --- .../dram_latency/Makefile | 4 +- .../dram_latency/kernel_dram_latency.cpp | 37 ++++++++++++++++--- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile b/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile index a04013cbd..536834271 100644 --- a/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/Makefile @@ -23,8 +23,8 @@ KERNEL_NAME ?=kernel_dram_latency OBJECT_FILES=main.o kernel_dram_latency.o RISCV_GXX_EXTRA_OPTS = -DVCACHE_SET=$(BSG_MACHINE_VCACHE_SET) \ - -DVCACHE_WAY=$(BSG_MACHINE_VCACHE_WAY) \ - -DVCACHE_BLOCK_SIZE_WORDS=$(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS) + 
-DVCACHE_WAY=$(BSG_MACHINE_VCACHE_WAY) \ + -DVCACHE_BLOCK_SIZE_WORDS=$(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS) include ../../Makefile.include diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp index 111f9c39a..be50c788f 100644 --- a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp @@ -10,16 +10,39 @@ const size_t VCACHE_SIZE_WORDS = VCACHE_NUM_BLOCKS * VCACHE_BLOCK_SIZE_WORDS; const size_t NUM_BANKS = 2 * bsg_tiles_Y; -const uint32_t DRAM_START_ADDR = 0x80000000; +const uint32_t DRAM_ADDR_PREFIX = 0x80000000; -// Returns the eva we should write to given the index in -// the Vcache. +// Returns the eva we should write to given the block index and bank. // // Inverse of bsg_manycore/v/vanilla_bean/hash_function.v // Based on bsg_manycore/v/vanilla_bean/hash_function_reverse.v uint32_t vcache_inverse_hash_function(size_t block_index, size_t bank) { - return 0x80000000; + uint32_t bank_shift = 0; + uint32_t num_banks = NUM_BANKS; + while(num_banks >>= 1) + bank_shift++; + + uint32_t block_shift = 2; // byte offset + uint32_t vcache_block_size_words = VCACHE_BLOCK_SIZE_WORDS; + while(vcache_block_size_words >>= 1) + block_shift++; + + uint32_t eva; + + if(NUM_BANKS != 9) { + eva = (block_index << bank_shift) | bank; + } else { + if (bank != 8) { + eva = (block_index << bank_shift) | (bank & 7); + } else { + eva = (block_index << bank_shift) | (block_index & 6) | + ((block_index ^ (block_index >> 9)) & 1); + } + } + + eva = DRAM_ADDR_PREFIX | (eva << block_shift); + return eva; } // Issues a load to given vcache block index and bank @@ -57,9 +80,11 @@ int kernel_dram_latency(int dummy) { bsg_cuda_print_stat_kernel_start(); size_t offset = VCACHE_NUM_BLOCKS + 1; - // Issue loads to 64 blocks in the opened page - for(size_t i = offset; i < offset + 64; ++i) + // 
Issue loads to 32 blocks in the opened page + for(size_t i = offset; i < offset + 32; ++i) { load_vcache_index(i, 0); + bsg_fence(); + } bsg_cuda_print_stat_kernel_end(); return 0; From 04b719f604456de5378f2f343f5a82f709c8f3ff Mon Sep 17 00:00:00 2001 From: Bandhav Veluri Date: Fri, 31 Jul 2020 13:03:45 -0700 Subject: [PATCH 3/5] Flush with block indices --- .../dram_latency/kernel_dram_latency.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp index be50c788f..6a5768e14 100644 --- a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp @@ -59,10 +59,10 @@ inline void load_vcache_index(size_t i, size_t bank) { // Flushes the vcache associated with a given bank void flush_vcache(size_t bank) { // Distribute vcache block indices among all tiles - size_t len_per_tile = VCACHE_SIZE_WORDS / (bsg_tiles_X * bsg_tiles_Y) + 1; + size_t len_per_tile = VCACHE_NUM_BLOCKS / (bsg_tiles_X * bsg_tiles_Y) + 1; size_t start = __bsg_id * len_per_tile; size_t end = start + len_per_tile; - end = (end > VCACHE_SIZE_WORDS) ? VCACHE_SIZE_WORDS : end; + end = (end > VCACHE_NUM_BLOCKS) ? 
VCACHE_NUM_BLOCKS : end; // Issue load to each block index for(size_t i = start; i < end; ++i) From beac2b23e6ad1144103ffca86c57436178c2523c Mon Sep 17 00:00:00 2001 From: Bandhav Veluri Date: Fri, 31 Jul 2020 13:18:24 -0700 Subject: [PATCH 4/5] Only do fenced loads from tile 0 --- .../dram_latency/kernel_dram_latency.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp index 6a5768e14..116fc7041 100644 --- a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp @@ -79,11 +79,13 @@ int kernel_dram_latency(int dummy) { load_vcache_index(VCACHE_NUM_BLOCKS, 0); bsg_cuda_print_stat_kernel_start(); - size_t offset = VCACHE_NUM_BLOCKS + 1; - // Issue loads to 32 blocks in the opened page - for(size_t i = offset; i < offset + 32; ++i) { - load_vcache_index(i, 0); - bsg_fence(); + if(__bsg_id == 0) { + size_t offset = VCACHE_NUM_BLOCKS + 1; + // Issue loads to 32 blocks in the opened page + for(size_t i = offset; i < offset + 32; ++i) { + load_vcache_index(i, 0); + bsg_fence(); + } } bsg_cuda_print_stat_kernel_end(); From 6069db1084bc8d1a03896f7e24a2d51e4634b484 Mon Sep 17 00:00:00 2001 From: Bandhav Veluri Date: Fri, 31 Jul 2020 13:21:08 -0700 Subject: [PATCH 5/5] Added barrier --- .../dram_latency/kernel_dram_latency.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp index 116fc7041..0a00ceb3d 100644 --- a/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp +++ b/software/spmd/bsg_cuda_lite_runtime/dram_latency/kernel_dram_latency.cpp @@ -5,6 +5,10 @@ #include #include +#include "bsg_tile_group_barrier.hpp" + 
+bsg_barrier barrier; + const size_t VCACHE_NUM_BLOCKS = VCACHE_SET * VCACHE_WAY; const size_t VCACHE_SIZE_WORDS = VCACHE_NUM_BLOCKS * VCACHE_BLOCK_SIZE_WORDS; @@ -76,7 +80,7 @@ int kernel_dram_latency(int dummy) { // Opens a new page assuming vcache size would be // a page boundary. - load_vcache_index(VCACHE_NUM_BLOCKS, 0); + if(__bsg_id == 0) load_vcache_index(VCACHE_NUM_BLOCKS, 0); bsg_cuda_print_stat_kernel_start(); if(__bsg_id == 0) { @@ -89,5 +93,7 @@ int kernel_dram_latency(int dummy) { } bsg_cuda_print_stat_kernel_end(); + barrier.sync(); + return 0; }