diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 74d58eeb3..9ca9ec067 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -52,8 +52,29 @@ GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx TEST_NAME = main # KERNEL_NAME is the name of the CUDA-Lite Kernel KERNEL_NAME = pr_nibble +HOST_TARGET := $(TEST_NAME).profile + +BASE_VERSIONS += hybrid-update + +ITERATIONS := 0 1 2 3 4 5 6 7 8 9 +v-from-basev-and-iter = $1-iteration-$2 +basev-from-v = $(word 1,$(subst -iteration-, ,$1)) +iter-from-v = $(word 2,$(subst -iteration-, ,$1)) + +VERSIONS := $(foreach i,$(ITERATIONS),$(foreach v,$(BASE_VERSIONS),\ + $(call v-from-basev-and-iter,$v,$i))) + +VERSION-DIRS := $(foreach v,$(VERSIONS),kernel/$v) + +.PHONY: $(VERSION-DIRS) +$(VERSION-DIRS): + cp -r $(call basev-from-v,$@) $@ + +.PHONY: versions bleach-versions +versions: $(VERSION-DIRS) +bleach-versions: + rm -rf $(VERSION-DIRS) -VERSIONS = hybrid DEFAULT_VERSION := hybrid KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp @@ -131,20 +152,19 @@ SIM_ARGS ?= # Include platform-specific execution rules include $(EXAMPLES_PATH)/execution.mk -HOST_TARGET := $(TEST_NAME).profile $(VERSIONS): %: kernel/%/$(HOST_TARGET).log -ALIASES = vanilla_stats.csv vcache_stats.csv +ALIASES = vanilla_stats.csv vcache_stats.csv dramsim3epoch.json dramsim3.json dramsim3.tag.json dramsim3.txt $(ALIASES): $(HOST_TARGET).log ; -$(HOST_TARGET).log: kernel.riscv $(HOST_TARGET) +$(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv ./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@ KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a) .PRECIOUS: $(KERNEL_ALIASES) $(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ; -kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET) +kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv $(eval EXEC_PATH := $(patsubst %/,%,$(dir $@))) $(eval 
KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH)) $(eval _VERSION := $(notdir $(EXEC_PATH))) @@ -152,7 +172,9 @@ kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET) $(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \ 2>&1 | tee $(notdir $a) -versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) +.PRECIOUS: %.log + +all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) ############################################################################### # Regression Flow diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp new file mode 100644 index 000000000..16e66425c --- /dev/null +++ b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp @@ -0,0 +1,229 @@ +//#define DEBUG +#include + +#ifdef DEBUG +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#else +#include +// #define BSG_TILE_GROUP_X_DIM 16 +// #define BSG_TILE_GROUP_Y_DIM 8 +#endif + +#include +bsg_barrier barrier; + +#include +#include + +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) 
+#endif + +__attribute__((section(".dram"))) float * __restrict p; +__attribute__((section(".dram"))) float * __restrict old_rank; +__attribute__((section(".dram"))) float * __restrict new_rank; +__attribute__((section(".dram"))) int * __restrict out_degree; +__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; +//__attribute__((section(".dram"))) double alpha = 0.15; +//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; + +template <typename APPLY_FUNC> int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + int * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s]]) { + //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); + apply_func (neighbors[s] , d); + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +template <typename APPLY_FUNC> int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int s = start; s < end; s++) { + if(from_vertexset[s]) { + int degree = out_indices[s + 1] - out_indices[s]; + int * neighbors = &out_neighbors[out_indices[s]]; + for(int d = 0; d < degree; d++) { + apply_func (s, neighbors[d]); + //if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); } + + } + } //end of loop on in neighbors + } //end of outer for loop
+ //barrier.sync(); + return 0; +} //end of edgeset apply function + + +struct generated_vector_op_apply_func_4 +{ + void operator() (int v) + { + out_degree[v] = generated_tmp_vector_3[v]; + }; +}; +struct new_rank_generated_vector_op_apply_func_2 +{ + void operator() (int v) + { + new_rank[v] = ((float) 0) ; + }; +}; +struct old_rank_generated_vector_op_apply_func_1 +{ + void operator() (int v) + { + old_rank[v] = ((float) 0) ; + }; +}; +struct p_generated_vector_op_apply_func_0 +{ + void operator() (int v) + { + p[v] = ((float) 0) ; + }; +}; +struct updateEdge +{ + void operator() (int src, int dst) + { + float alpha = 0.15; + new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); + }; +}; +struct updateSelf +{ + void operator() (int v) + { + float alpha = 0.15; + p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); + new_rank[v] = (0) ; + }; +}; +struct filter_frontier +{ + bool operator() (int v) + { + float epsilon = (float) 1e-6; + bool output ; + //if(old_rank[v] == 0) return 0; + if(new_rank[v] == 0) return 0; + //output = (old_rank[v]) > ((out_degree[v] * epsilon)); + output = (new_rank[v]) > ((out_degree[v] * epsilon)); + return output; + }; +}; + +extern "C" int __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + p_generated_vector_op_apply_func_0()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + old_rank_generated_vector_op_apply_func_1()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; 
iter_x < end; iter_x++) { + new_rank_generated_vector_op_apply_func_2()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + generated_vector_op_apply_func_4()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if(frontier[iter_x]) { updateSelf()(iter_x); } + } + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { + //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c); + //pr_dbg("%i: on frontier filter %i\n", bsg_id, 
tag_c); + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if (iter_x < V) { + next5[iter_x] = 0; + if ( filter_frontier()( iter_x ) ) { + next5[iter_x] = 1; + //pr_dbg("added vertex %i to frontier\n", iter_x); + } + } + else { break; } + } //end of loop + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index ff396f302..9cbfdde2b 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -34,11 +34,11 @@ int launch(int argc, char ** argv){ std::string ucode_path = input.getRISCVFile(); int iter = 0; - // std::string iterstrbase = "iteration-"; - // auto pos = ucode_path.find(iterstrbase); - // auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); - // std::stringstream ss(iterstr); - // ss >> iter; + std::string iterstrbase = "iteration-"; + auto pos = ucode_path.find(iterstrbase); + auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); + std::stringstream ss(iterstr); + ss >> iter; std::cerr << "iteration: " << iter << std::endl; int version = 0; //default to vertex pull @@ -84,7 +84,6 @@ int launch(int argc, char ** argv){ float epsilon = ((float) 1e-06) ; int start_vertex = ROOT; Vector frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); - //Vector next_frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); std::vector hfrontier(edges.num_nodes(), 0); std::vector p(edges.num_nodes(), (float) 0.0); @@ -122,12 +121,7 @@ int launch(int argc, char ** argv){ std::cerr << "start of while loop\n"; int tag_c = 0; - //double host_rank[edges.num_nodes()]; - //ofstream prog_file; - //prog_file.open("./progress.txt"); - //prog_file << "starting computation w/ root vertex: " << start_vertex << std::endl; //while ( 
builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) - //while ( iter < 16) for(int i = 0; i < 1; i++) { int f_sz = 0; @@ -136,10 +130,10 @@ int launch(int argc, char ** argv){ case 0: //vertex pull std::cerr << "pull kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; - std::cerr << "run update edges kernel on iter : " << iter << "\n"; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; @@ -154,25 +148,25 @@ int launch(int argc, char ** argv){ case 1: //vertex push std::cerr << "push kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; - std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); std::cerr << "create next frontier\n"; device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); device->runJobs(); + std::cerr << "swap arrays\n"; + 
hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; case 2: //blocked pull std::cerr << "blocked pull kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; @@ -191,13 +185,9 @@ int launch(int argc, char ** argv){ tag_c++; iter++; - //prog_file << "finished iteration: " << iter << std::endl; } std::cerr << "*******end of program********\n"; - //prog_file << "*******end of program********\n"; std::cerr << "took: " << iter << " iterations to complete\n"; - //prog_file << "took: " << iter << " iterations to complete\n"; - //prog_file.close(); if(VERIFY) { ofstream ver_file; ver_file.open("./rank.txt"); diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp index 7e7479495..1923c6d6d 100644 --- a/examples/graphit/test_pr_nibble/pr_host.hpp +++ b/examples/graphit/test_pr_nibble/pr_host.hpp @@ -13,16 +13,16 @@ inline void host_pr_calc(std::vector & p, std::vector & old_rank, ofstream ofile; ofile.open (fname); for(int i = 0; i < iter; i++) { - //std::memcpy(new_rank, old_rank, sizeof(float)*edges.num_nodes()); - //new_rank = old_rank; new_rank.assign(old_rank.begin(), old_rank.end()); //print out iteration and size: int num_items = std::count(frontier.begin(), frontier.end(), 1); std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; //update_self for(int v = 0; v < g.num_nodes(); v++) { - p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; - new_rank[v] = (float) 0.0 ; + if(frontier[v]) { + p[v] += (2.0 * alpha) / (1.0 + 
alpha) * old_rank[v]; + new_rank[v] = (float) 0.0 ; + } } //update edges for(int d = 0; d < g.num_nodes(); d++) { @@ -35,9 +35,6 @@ inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank, } } } - //old_rank.swap(new_rank); - //std::memcpy(old_rank, new_rank, sizeof(float)*edges.num_nodes()); - //old_rank = new_rank; old_rank.assign(new_rank.begin(), new_rank.end()); //update frontier for(int v = 0; v < g.num_nodes(); v++) {