diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..f352d419a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,12 @@ +[submodule "n"] + path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval + url = git@github.com:bespoke-silicon-group/hb-prog-eval +[submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"] + path = examples/sdh-eval-workloads/ipnsw/graph-tools + url = git@github.com:bespoke-silicon-group/graph-tools +[submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"] + path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers + url = git@github.com:bespoke-silicon-group/hammerblade-helpers +[submodule "examples/graphit/graphit-src"] + path = examples/graphit/graphit-src + url = git@github.com:bespoke-silicon-group/graphit.git diff --git a/examples/Makefile b/examples/Makefile index 1bc3055e8..6fda8a5ef 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -45,7 +45,7 @@ include $(REPLICANT_PATH)/environment.mk include $(EXAMPLES_PATH)/link.mk # Supported example suites -TARGETS = library spmd cuda python +TARGETS = library spmd cuda python graphit # Define the tests that get run TESTS += test_loader diff --git a/examples/cuda/riscv.mk b/examples/cuda/riscv.mk index 87a52d511..00b37c1eb 100644 --- a/examples/cuda/riscv.mk +++ b/examples/cuda/riscv.mk @@ -244,7 +244,7 @@ RISCV_LDFLAGS += -Wl,--no-check-sections # This builds a .riscv binary for the current machine type and tile # group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked # in the final binary. 
-%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) +kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ kernel.link.clean: diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile new file mode 100644 index 000000000..600ef53f4 --- /dev/null +++ b/examples/graphit/Makefile @@ -0,0 +1,61 @@ +# Copyright (c) 2019, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples. Run `make help` +# to see the available targets for the selected platform. + +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRARIES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +# CL_DIR: Path to the directory of this AWS F1 Project +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk + +# Defines REGRESSION_PREBUILD +include $(EXAMPLES_PATH)/link.mk + +# Define the tests that get run; each name is a subdirectory entered with $(MAKE) -C +TESTS += pr_nibble + +regression: $(TESTS) + @echo "GRAPHIT REGRESSION PASSED" + +$(TESTS): $(REGRESSION_PREBUILD) + $(MAKE) -C $@ regression + +clean: $(TESTS:=.clean) + +%.clean: + $(MAKE) -C $(@:.clean=) clean + +.PHONY: clean regression $(TESTS) %.clean diff --git a/examples/graphit/graphit-src b/examples/graphit/graphit-src new file mode 160000 index 000000000..9f4d8e9ba --- /dev/null +++ b/examples/graphit/graphit-src @@ -0,0 +1 @@ +Subproject commit 9f4d8e9bacac0ed44afe7c3abde697f21457a487 diff --git a/examples/graphit/pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile new file mode 100644 index 000000000..af6475765 --- /dev/null +++ b/examples/graphit/pr_nibble/Makefile @@ -0,0 
+1,206 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples Run `make help` +# to see the available targets for the selected platform. 
+ +################################################################################ +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRAIRES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +############################################################################### + +CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk +SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd +CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime +GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-src + +GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx +# TEST_NAME is the basename of the executable +TEST_NAME = main +# KERNEL_NAME is the name of the CUDA-Lite Kernel +KERNEL_NAME = pr_nibble +HOST_TARGET := $(TEST_NAME).exec + +BASE_VERSIONS += hybrid + +ITERATIONS := 0 1 2 3 4 5 6 7 8 9 +v-from-basev-and-iter = $1-iteration-$2 +basev-from-v = $(word 1,$(subst -iteration-, ,$1)) +iter-from-v = $(word 2,$(subst -iteration-, ,$1)) + +VERSIONS := $(foreach i,$(ITERATIONS),$(foreach v,$(BASE_VERSIONS),\ + $(call v-from-basev-and-iter,$v,$i))) + +VERSION-DIRS := $(foreach v,$(VERSIONS),kernel/$v) + +.PHONY: $(VERSION-DIRS) +$(VERSION-DIRS): + cp -r $(call basev-from-v,$@) $@ + +.PHONY: versions bleach-versions +versions: $(VERSION-DIRS) +bleach-versions: + rm -rf $(VERSION-DIRS) + +DEFAULT_VERSION := hybrid +KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp + +############################################################################### +# Host code compilation flags and flow +############################################################################### + +# TEST_SOURCES is a list of source files that need to 
be compiled +TEST_SOURCES = main.cpp + +DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE +CDEFINES += +CXXDEFINES += + +FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable +CFLAGS += -std=c99 $(FLAGS) +CXXFLAGS += -std=c++11 $(FLAGS) + + +# compilation.mk defines rules for compilation of C/C++ +include $(EXAMPLES_PATH)/compilation.mk + +# Specify any header file dependencies +main.o: INCLUDES += -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ + +############################################################################### +# Host code link flags and flow +############################################################################### + +LDFLAGS += + +# link.mk defines rules for linking of the final execution binary. +include $(EXAMPLES_PATH)/link.mk + +############################################################################### +# Device code compilation flow +############################################################################### + +# BSG_MANYCORE_KERNELS is a list of manycore executables that should +# be built before executing. 
+BSG_MANYCORE_KERNELS = kernel.riscv + +kernel.rvo: RISCV_CXX = $(RISCV_GXX) +kernel.riscv: kernel.rvo + +%/kernel.rvo: RISCV_CXX = $(RISCV_GXX) + +# Tile Group Dimensions +TILE_GROUP_DIM_X = 16 +TILE_GROUP_DIM_Y = 8 +RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) + +RISCV_INCLUDES += -I$(CURRENT_PATH)/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ + +include $(EXAMPLES_PATH)/cuda/riscv.mk + +%/kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo %/kernel.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) + $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ + +############################################################################### +# Execution flow +# +# C_ARGS: Use this to pass arguments that you want to appear in argv +# For SPMD tests C arguments are: +# +# SIM_ARGS: Use this to pass arguments to the simulator +############################################################################### +#C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) +C_ARGS ?= $(KERNEL_NAME) -g $(GRAPH_PATH) + +SIM_ARGS ?= + +# Include platform-specific execution rules +include $(EXAMPLES_PATH)/execution.mk + + +$(VERSIONS): %: kernel/%/$(HOST_TARGET).log + +ALIASES = vanilla_stats.csv vcache_stats.csv dramsim3epoch.json dramsim3.json dramsim3.tag.json dramsim3.txt +$(ALIASES): $(HOST_TARGET).log ; +$(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv + ./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@ + + +KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a) +.PRECIOUS: $(KERNEL_ALIASES) kernel/%/kernel.riscv +$(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ; +kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv + $(eval EXEC_PATH := $(patsubst %/,%,$(dir $@))) + $(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH)) + $(eval _VERSION := $(notdir $(EXEC_PATH))) + cd $(EXEC_PATH) && \ + 
$(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \ + 2>&1 | tee $(notdir $@) +# The recipe has already cd'ed into $(EXEC_PATH), so tee-ing to $(notdir $@) writes exactly the target log +.PRECIOUS: %.log + +all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) + +############################################################################### +# Regression Flow: every per-version log must contain the PASSED banner +############################################################################### + +regression: versions all-versions + @for log in $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log); do grep "BSG REGRESSION TEST .*PASSED.*" $$log > /dev/null || exit 1; done + +############################################################################### +# Default rules, help, and clean +############################################################################### +.DEFAULT_GOAL := help +help: + @echo "Usage:" + @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" + @echo " $(TEST_NAME).profile: Build executable with profilers enabled" + @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" + @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" + @echo " clean: Remove all subdirectory-specific outputs" + +print-% : ; @echo $* = $($*) + +version.clean: + rm -rf $(foreach e,.ucli .csv .log .rvo .riscv .vpd .key .dis .ll .ll.s,kernel/*/*$e) + +.PHONY: clean version.clean all-versions regression help + +clean: bleach-versions version.clean + + diff --git a/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp new file mode 100644 index 000000000..294d564a6 --- /dev/null +++ b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp @@ -0,0 +1,175 @@ +//#define DEBUG +#include + +#ifdef DEBUG +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#else +#include +#endif + +#include +bsg_barrier barrier; + +#include +#include + +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) 
+#endif + +__attribute__((section(".dram"))) float * __restrict p; +__attribute__((section(".dram"))) float * __restrict old_rank; +__attribute__((section(".dram"))) float * __restrict new_rank; +__attribute__((section(".dram"))) int * __restrict out_degree; + + +template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + int start, end; + local_range(V, &start, &end); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + int * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s]]) { + apply_func (neighbors[s] , d); + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + int start, end; + local_range(V, &start, &end); + for ( int s = start; s < end; s++) { + if(from_vertexset[s]) { + int degree = out_indices[s + 1] - out_indices[s]; + int * neighbors = &out_neighbors[out_indices[s]]; + for(int d = 0; d < degree; d++) { + apply_func (s, neighbors[d]); + + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +struct updateEdge +{ + void operator() (int src, int dst) + { + float alpha = 0.15; + new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); + }; +}; +struct updateSelf +{ + void operator() (int v) + { + float alpha = 0.15; + p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); + new_rank[v] = (0) ; + }; +}; +struct filter_frontier +{ + bool operator() (int v) + { + float epsilon = (float) 1e-6; + bool output ; + if(new_rank[v] == 0) return 0; + output = (new_rank[v]) > ((out_degree[v] * epsilon)); + 
return output; + }; +}; + +extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if(frontier[iter_x]) {updateSelf()(iter_x);} + } + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if (iter_x < V) { + next5[iter_x] = 0; + if ( filter_frontier()( iter_x ) ) { + next5[iter_x] = 1; + } + } + else { break; } + } //end of loop + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) { + int id = __bsg_id; + int threads = bsg_tiles_X * bsg_tiles_Y; + // 
prefetch all data; + for (int i = 32 * id; i < E; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (p[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i])); + } + barrier.sync(); + return ; + +} diff --git a/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp new file mode 100644 index 000000000..ee50a54d6 --- /dev/null +++ b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp @@ -0,0 +1,9 @@ +#pragma once +#ifndef __PR_PULL_BENCHMARK_HPP +#define __PR_PULL_BENCHMARK_HPP + +#include +#include +#include +#include +#endif diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp new file mode 100644 index 000000000..aa2d4032f --- /dev/null +++ b/examples/graphit/pr_nibble/main.cpp @@ -0,0 +1,177 @@ +#include "pr.hpp" + +//#define DEBUG + +#define VERIFY 0 + +#ifdef DEBUG +#define X 1 +#define Y 1 +#else +#define X 16 //tile group dim X +#define Y 8 // tile group dim Y +#endif + +#define ROOT 6 +#define NUM_LOCKS 1024 //width of manycore * 64 + +GraphHB edges; +GlobalScalar p_dev; +GlobalScalar old_rank_dev; +GlobalScalar new_rank_dev; +GlobalScalar out_degree_dev; + +#include "pr_host.hpp" + +int test_pr_nibble(int argc, char ** argv){ + InputParser input(argc, argv); + if(!input.cmdOptionExists("-g")){ + std::cerr << "no input 
args\n"; + return 0; + } + std::string ucode_path = input.getRISCVFile(); + + int iter = 0; + std::string iterstrbase = "iteration-"; + auto pos = ucode_path.find(iterstrbase); + auto iterstr = (pos == std::string::npos) ? std::string() : ucode_path.substr(pos+iterstrbase.size()); // no "iteration-" tag in the path: parse nothing, iter stays 0 + std::stringstream ss(iterstr); + ss >> iter; + std::cerr << "iteration: " << iter << std::endl; + + int version = 0; //default to vertex pull + if(ucode_path.find("push") != std::string::npos) { + version = 1; + } + int hybrid = 0; //default: direction comes from the path only; hybrid re-picks push/pull below + if(ucode_path.find("hybrid") != std::string::npos) { + hybrid = 1; + } + std::cerr << "version: " << version << std::endl; + std::cerr << "hybrid: " << hybrid << std::endl; + std::cerr << "load microcode" << std::endl; + hammerblade::builtin_loadMicroCodeFromFile(ucode_path); + + std::cerr << "load graph" << std::endl; + std::string graph_f = input.getCmdOption("-g"); + edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); + + std::cerr << "size of graph: " << std::endl; + std::cerr << edges.num_nodes() << std::endl; + std::cerr << edges.num_edges() << std::endl; + + std::cerr << "init global scalars" << std::endl; + p_dev = GlobalScalar("p"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); + old_rank_dev = GlobalScalar("old_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); + new_rank_dev = GlobalScalar("new_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); + out_degree_dev = GlobalScalar("out_degree"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); + + std::cerr << "init locks" << std::endl; + GlobalScalar glbl_locks = GlobalScalar("locks"); + hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); + std::atomic tmp_a[NUM_LOCKS] = {}; + Device::Ptr device = Device::GetInstance(); + int start_vertex = ROOT; + Vector frontier = 
Vector(hammerblade::builtin_getVerticesHB(edges)); + + std::vector hfrontier(edges.num_nodes(), 0); + std::vector p(edges.num_nodes(), (float) 0.0); + std::vector new_rank(edges.num_nodes(), (float) 0.0); + std::vector old_rank(edges.num_nodes(), (float) 0.0); + std::vector out_degs = edges.get_out_degrees(); + + //compute up to current iter on host + hfrontier[start_vertex] = 1; + new_rank[start_vertex] = (float) 1.0; + old_rank[start_vertex] = (float) 1.0; + host_pr_calc(p, old_rank, new_rank, hfrontier, iter); + + //copy all variables at their current state to device + frontier.copyToDevice(hfrontier.data(), hfrontier.size()); + hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); + hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); + hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); + hammerblade::write_global_buffer_dma(out_degs.data(), out_degree_dev, out_degs.size()); + //initialize locks for atomics on device + hammerblade::write_global_buffer_dma>(tmp_a, glbl_locks, NUM_LOCKS); + + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + //determine push or pull traversal for this iteration + if(hybrid) { + int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1); + int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges()); + if(dir){ + version = 0; //pull + } else { + version = 1; //push + } + } + + std::cerr << "start of while loop\n"; + int tag_c = 0; + int f_sz = 0; + switch(version) { + case 0: //vertex pull + std::cerr << "pull kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), 
edges.num_nodes(), tag_c}); // name must match the extern "C" updateSelf_kernel symbol in kernel.cpp (case-sensitive) + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + case 1: //vertex push + std::cerr << "push kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); // same case-sensitive kernel symbol as the pull path + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = 
builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + } + if(VERIFY) { + ofstream ver_file; + ver_file.open("./rank.txt"); + float host_rank[edges.num_nodes()]; + hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); + for(int i = 0; i < edges.num_nodes(); i++) { + ver_file << host_rank[i] << std::endl; + } + ver_file.close(); + } + device->finish(); + return 0; +} + +declare_program_main("test_pr_nibble", test_pr_nibble); diff --git a/examples/graphit/pr_nibble/pr.hpp b/examples/graphit/pr_nibble/pr.hpp new file mode 100644 index 000000000..ae01c8cc2 --- /dev/null +++ b/examples/graphit/pr_nibble/pr.hpp @@ -0,0 +1,25 @@ +#pragma once +#ifndef __PR_PULL_BENCHMARK_HPP +#define __PR_PULL_BENCHMARK_HPP + +#include "hb_intrinsics.h" //graphit host runtime libs +#include "infra_hb/host/arg_parser.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using hammerblade::Device; +using hammerblade::Vector; +using hammerblade::GraphHB; +using hammerblade::GlobalScalar; + +#endif diff --git a/examples/graphit/pr_nibble/pr_host.hpp b/examples/graphit/pr_nibble/pr_host.hpp new file mode 100644 index 000000000..fcbb811e0 --- /dev/null +++ b/examples/graphit/pr_nibble/pr_host.hpp @@ -0,0 +1,45 @@ +//function to compute pr-nibble on host up to current iter +#pragma once +#include +#include + +inline void host_pr_calc(std::vector & p, std::vector & old_rank, std::vector & new_rank, std::vector & frontier, int iter) { + float alpha = (float) 0.15; + float epsilon = (float) 1e-06; + auto g = edges.getHostGraph(); + int * in_neigh = g.in_neighbors_shared_.get(); + int ** in_index = g.in_index_shared_.get(); + for(int i = 0; i < iter; i++) { + new_rank.assign(old_rank.begin(), old_rank.end()); + //print out iteration and size: + int num_items = std::count(frontier.begin(), frontier.end(), 
1); + std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; + //update_self + for(int v = 0; v < g.num_nodes(); v++) { + if(frontier[v]) { + p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; + new_rank[v] = (float) 0.0 ; + } + } + //update edges + for(int d = 0; d < g.num_nodes(); d++) { + for(int s : g.in_neigh(d)) { + if(frontier[s]){ + float update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; + update = update / ((float) g.out_degree(s)); + new_rank[d] += update; + } + } + } + old_rank.assign(new_rank.begin(), new_rank.end()); + //update frontier + for(int v = 0; v < g.num_nodes(); v++) { + frontier[v] = 0; + if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) { + frontier[v] = 1; + } + } + } + int num_items = std::count(frontier.begin(), frontier.end(), 1); + std::cerr << "returning with frontier size: " << num_items << std::endl; +} diff --git a/examples/sdh-eval-workloads/ipnsw/.gitignore b/examples/sdh-eval-workloads/ipnsw/.gitignore new file mode 100644 index 000000000..737e26b00 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/.gitignore @@ -0,0 +1 @@ +run/ \ No newline at end of file diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp new file mode 100644 index 000000000..3d14b2c8d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "BeamSearchKernelRunner.hpp" +#include "BeamSearchResultReader.hpp" +namespace ipnsw { + class BeamSearchFactory : public IPNSWFactory { + private: + IPNSWKernelRunner *_KernelRunner() const { return new BeamSearchKernelRunner; } + IPNSWResultReader *_ResultReader() const { return new BeamSearchResultReader; } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp new file mode 100644 index 
000000000..6fe724b68 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp @@ -0,0 +1,52 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" +#include "GreedyWalkResults.hpp" + +namespace ipnsw { + class BeamSearchKernelRunner : public IPNSWKernelRunner { + std::string kernelName(const IPNSWRunner & runner) const { + return "ipnsw_beam_search"; + } + + Dim tgd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grp_x(), + runner.cfg().grp_y()); + } + + Dim gd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grid_x(), + runner.cfg().grid_y()); + } + std::vector argv(const IPNSWRunner & runner) const { + int v_curr; + float d_curr; + std::vector do_queries = runner._io->do_queries(); + if (do_queries.empty()) { + v_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + d_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + } else { + v_curr = std::get(GREEDY_WALK_RESULTS[do_queries[0]]); + d_curr = std::get(GREEDY_WALK_RESULTS[do_queries[0]]); + } + + HammerBlade::Ptr hb = HammerBlade::Get(); + hb->write(runner.v_curr_dev(0), &v_curr, sizeof(v_curr)); + hb->write(runner.d_curr_dev(0), &d_curr, sizeof(d_curr)); + + std::vector argv = { + runner.graph_metadata_dev(), + runner.db_dev(), + runner.query_dev(0), + runner.seen_dev(0), + runner.v_curr_dev(0), + runner.d_curr_dev(0), + runner.candidates_dev(0), + runner.results_dev(0), + runner.n_results_dev(0), + }; + return argv; + } + + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp new file mode 100644 index 000000000..3d4cc7493 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp @@ -0,0 +1,26 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" +#include "GreedyWalkResults.hpp" + +namespace ipnsw { + class BeamSearchResultReader : public IPNSWResultReader { + public: + void 
readResults(const IPNSWRunner & runner) { + HammerBlade::Ptr hb = HammerBlade::Get(); + + hb_mc_eva_t grp = 0; + int n_results; + hb->read(runner.n_results_dev(grp), &n_results, sizeof(int)); + + std::vector results(n_results); + hb->push_read(runner.results_dev(grp), &results[0], n_results * sizeof(GreedyWalkResult)); + hb->sync_read(); + + std::cout << "Beam search:" << std::endl; + for (auto & r : results) { + std::cout << "{" << std::get<0>(r) << "," << std::get<1>(r) << "}" << std::endl; + } + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp new file mode 100644 index 000000000..e98f11ad2 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "GreedyWalkKernelRunner.hpp" +#include "GreedyWalkResultReader.hpp" + +namespace ipnsw { + class GreedyWalkFactory : public IPNSWFactory { + private: + IPNSWKernelRunner *_KernelRunner() const { return new GreedyWalkKernelRunner; } + IPNSWResultReader *_ResultReader() const { return new GreedyWalkResultReader; } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp new file mode 100644 index 000000000..ac51739b4 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp @@ -0,0 +1,35 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" + +namespace ipnsw { + class GreedyWalkKernelRunner : public IPNSWKernelRunner { + + Dim tgd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grp_x(), + runner.cfg().grp_y()); + } + + Dim gd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grid_x(), + runner.cfg().grid_y()); + } + + std::string kernelName(const IPNSWRunner & runner) const { + return "ipnsw_greedy_search"; + } + + std::vector argv(const IPNSWRunner & runner) const { + std::vector argv = 
{ + runner.graph_metadata_dev(), + runner.db_dev(), + runner.query_dev(0), + runner.seen_dev(0), + runner.v_curr_dev(0), + runner.d_curr_dev(0), + }; + return argv; + } + + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp new file mode 100644 index 000000000..6ca7851ff --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp @@ -0,0 +1,21 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" + +namespace ipnsw { + class GreedyWalkResultReader : public IPNSWResultReader { + public: + void readResults(const IPNSWRunner & runner) { + HammerBlade::Ptr hb = HammerBlade::Get(); + int v_curr; + float d_curr; + + hb->read(runner.v_curr_dev(0), &v_curr, sizeof(int)); + hb->read(runner.d_curr_dev(0), &d_curr, sizeof(float)); + + std::cout << "Greedy walk (v_curr,d_curr) = " + << "(" << v_curr << "," << d_curr << ")" + << std::endl; + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp new file mode 100644 index 000000000..7d37104df --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp @@ -0,0 +1,517 @@ +#include "GreedyWalkResults.hpp" +namespace ipnsw { + std::vector GREEDY_WALK_RESULTS = { + GreedyWalkResult(static_cast(-0x1.94442e0000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.e72901fffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.cb85360000001p-4),541780), + GreedyWalkResult(static_cast(-0x1.e56d7ffffffffp-8), 78517), + GreedyWalkResult(static_cast(-0x1.655f860000000p-4),732469), + GreedyWalkResult(static_cast(-0x1.04cbcc0000000p-4),380912), + GreedyWalkResult(static_cast(-0x1.3243d20000000p-5),606365), + GreedyWalkResult(static_cast(-0x1.2dbf640000000p-4),950108), + GreedyWalkResult(static_cast(-0x1.fa90ea0000001p-1),168533), + GreedyWalkResult(static_cast(-0x1.2922f80000000p-3),228514), + 
GreedyWalkResult(static_cast(-0x1.5974060000000p-1),725033), + GreedyWalkResult(static_cast(-0x1.abcf2c0000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.b262380000000p-1),272753), + GreedyWalkResult(static_cast(-0x1.c0e98a0000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.01b4680000000p-2),184077), + GreedyWalkResult(static_cast(-0x1.96e3280000000p-2),208965), + GreedyWalkResult(static_cast(-0x1.58dd120000000p-3),580161), + GreedyWalkResult(static_cast(-0x1.1f333a0000000p-3),236872), + GreedyWalkResult(static_cast(-0x1.8db7de0000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.4e43500000000p-2),909721), + GreedyWalkResult(static_cast(-0x1.a5ae760000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.7fcff00000000p-5),294738), + GreedyWalkResult(static_cast(-0x1.5630f40000000p-1),960530), + GreedyWalkResult(static_cast(-0x1.48d8c20000000p-1),853984), + GreedyWalkResult(static_cast(-0x1.14556ffffffffp+0),909721), + GreedyWalkResult(static_cast(-0x1.a746760000000p-2),865184), + GreedyWalkResult(static_cast(-0x1.ddcb81fffffffp-3),513240), + GreedyWalkResult(static_cast(-0x1.94a92ffffffffp-2),550771), + GreedyWalkResult(static_cast(-0x1.45b69c0000000p-1),432335), + GreedyWalkResult(static_cast(-0x1.2ef8fa0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.9909440000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.1ce937fffffffp-5),321516), + GreedyWalkResult(static_cast(-0x1.c0de380000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.0de8e60000000p-8),897966), + GreedyWalkResult(static_cast(-0x1.99783c0000000p-1),865184), + GreedyWalkResult(static_cast(-0x1.01316e0000000p+0),886263), + GreedyWalkResult(static_cast(-0x1.a172140000000p-6),177485), + GreedyWalkResult(static_cast(-0x1.2b8f9a0000000p-7),973080), + GreedyWalkResult(static_cast(-0x1.924b440000000p-5),290055), + GreedyWalkResult(static_cast(-0x1.8515aa0000000p-2),905210), + GreedyWalkResult(static_cast(-0x1.f68975ffffffep-3),294738), + 
GreedyWalkResult(static_cast(-0x1.dd5ed00000001p-6),790506), + GreedyWalkResult(static_cast(-0x1.be40740000000p-1),870888), + GreedyWalkResult(static_cast(-0x1.08f4460000001p-2),666073), + GreedyWalkResult(static_cast(-0x1.2589100000000p-2),385014), + GreedyWalkResult(static_cast(-0x1.e43ad00000001p-3),230001), + GreedyWalkResult(static_cast(-0x1.161b360000000p+0),646867), + GreedyWalkResult(static_cast(-0x1.475e87fffffffp-6),179303), + GreedyWalkResult(static_cast(-0x1.425b1c0000000p-1),463324), + GreedyWalkResult(static_cast(-0x1.f4b68c0000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.1333440000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.0e35aa0000000p-1),312088), + GreedyWalkResult(static_cast(-0x1.1b7653fffffffp+0),854962), + GreedyWalkResult(static_cast(-0x1.cb8adc0000000p-3),491377), + GreedyWalkResult(static_cast(-0x1.51a0380000000p-1),226268), + GreedyWalkResult(static_cast(-0x1.e4b9940000000p-2),603696), + GreedyWalkResult(static_cast(-0x1.623f9a0000000p-2),991097), + GreedyWalkResult(static_cast(-0x1.1660b20000000p-1), 18868), + GreedyWalkResult(static_cast(-0x1.bd75200000000p-7), 56131), + GreedyWalkResult(static_cast(-0x1.4dbbe00000000p+0), 16476), + GreedyWalkResult(static_cast(-0x1.1b55860000000p-5),310512), + GreedyWalkResult(static_cast(-0x1.1f40e00000000p+0),115894), + GreedyWalkResult(static_cast(-0x1.d403c60000001p-2),718485), + GreedyWalkResult(static_cast(-0x1.a7b7bdfffffffp-7),601673), + GreedyWalkResult(static_cast(-0x1.7f5c8c0000000p-2),552153), + GreedyWalkResult(static_cast(-0x1.6834060000001p-3),294738), + GreedyWalkResult(static_cast(-0x1.8ccf620000000p-2),513240), + GreedyWalkResult(static_cast(-0x1.1508660000000p+0),666073), + GreedyWalkResult(static_cast(-0x1.6362300000000p-1),982683), + GreedyWalkResult(static_cast(-0x1.175fbc0000000p-4),226268), + GreedyWalkResult(static_cast(-0x1.10e30a0000000p-5),703851), + GreedyWalkResult(static_cast(-0x1.0343340000000p+0),580161), + 
GreedyWalkResult(static_cast(-0x1.9337a20000000p-3),236872), + GreedyWalkResult(static_cast(-0x1.986e8a0000000p-7),986292), + GreedyWalkResult(static_cast(-0x1.1f400a0000000p+0),336830), + GreedyWalkResult(static_cast(-0x1.3c0e060000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.8589cc0000000p-1),118607), + GreedyWalkResult(static_cast(-0x1.745f000000000p-3),272753), + GreedyWalkResult(static_cast(-0x1.317ca40000000p-4),494402), + GreedyWalkResult(static_cast(-0x1.ebd52a0000001p-7),517512), + GreedyWalkResult(static_cast(-0x1.7ad9100000001p-6),986292), + GreedyWalkResult(static_cast(-0x1.6ed8a00000000p-2),134880), + GreedyWalkResult(static_cast(-0x1.273edc0000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.93db8c0000000p-1),620143), + GreedyWalkResult(static_cast(-0x1.324dd60000000p-4),778172), + GreedyWalkResult(static_cast(-0x1.3c59a80000000p-1),270175), + GreedyWalkResult(static_cast(-0x1.fc51e80000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.a7fbc60000000p-2),603696), + GreedyWalkResult(static_cast(-0x1.ab76780000000p-1),406402), + GreedyWalkResult(static_cast(-0x1.8733320000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.447bb00000000p-1),513240), + GreedyWalkResult(static_cast(-0x1.b5c3140000000p-5),729175), + GreedyWalkResult(static_cast(-0x1.ca9b880000000p-1),785859), + GreedyWalkResult(static_cast(-0x1.beee640000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.47b4e80000000p-1),738101), + GreedyWalkResult(static_cast(-0x1.069a7c0000000p-1),193430), + GreedyWalkResult(static_cast(-0x1.20f53c0000000p-1),118809), + GreedyWalkResult(static_cast(-0x1.1612f80000000p-2),711979), + GreedyWalkResult(static_cast(-0x1.25c6c80000000p-1),348136), + GreedyWalkResult(static_cast(-0x1.2507300000000p-2), 36731), + GreedyWalkResult(static_cast(-0x1.14ef720000001p+0),268974), + GreedyWalkResult(static_cast(-0x1.2b54f80000000p-4), 40323), + GreedyWalkResult(static_cast(-0x1.e07ccbfffffffp-1),294738), + 
GreedyWalkResult(static_cast(-0x1.070d960000000p-4),785239), + GreedyWalkResult(static_cast(-0x1.49e6200000000p-1),496330), + GreedyWalkResult(static_cast(-0x1.86c9080000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.0b584c0000000p-1),587902), + GreedyWalkResult(static_cast(-0x1.bb1ee00000000p-7),439426), + GreedyWalkResult(static_cast(-0x1.ff17c9fffffffp-11),467026), + GreedyWalkResult(static_cast(-0x1.0da6980000000p+0),294738), + GreedyWalkResult(static_cast(-0x1.1d0ba40000001p-3),288912), + GreedyWalkResult(static_cast(-0x1.301dec0000000p-1),541780), + GreedyWalkResult(static_cast(-0x1.2f9b800000000p-4),261103), + GreedyWalkResult(static_cast(-0x1.8d769e0000000p-4),239334), + GreedyWalkResult(static_cast(-0x1.6ea4f80000000p-3),223977), + GreedyWalkResult(static_cast(-0x1.fcc7dc0000000p-2),662137), + GreedyWalkResult(static_cast(-0x1.5949fe0000000p-3),565830), + GreedyWalkResult(static_cast(-0x1.1a11aa0000000p-1),908217), + GreedyWalkResult(static_cast(-0x1.8bff140000000p-1), 2251), + GreedyWalkResult(static_cast(-0x1.7ccda1fffffffp-2),467026), + GreedyWalkResult(static_cast(-0x1.80bf6e0000000p-2), 50016), + GreedyWalkResult(static_cast(-0x1.3444300000000p-2), 2251), + GreedyWalkResult(static_cast(-0x1.c8e8bc0000000p-1),223249), + GreedyWalkResult(static_cast(-0x1.679767fffffffp-3),494887), + GreedyWalkResult(static_cast(-0x1.6c896c0000000p-3),114191), + GreedyWalkResult(static_cast(-0x1.413b740000000p-4),772422), + GreedyWalkResult(static_cast(-0x1.4e1d760000000p-3),168533), + GreedyWalkResult(static_cast(-0x1.7202fe0000001p-1),131611), + GreedyWalkResult(static_cast(-0x1.2589840000000p+0),385014), + GreedyWalkResult(static_cast(-0x1.5820da0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.96ceb00000001p-3),177485), + GreedyWalkResult(static_cast(-0x1.d6ac77fffffffp-4),865184), + GreedyWalkResult(static_cast(-0x1.bfefa00000000p-7),149329), + GreedyWalkResult(static_cast(-0x1.69ac280000000p-1), 73867), + 
GreedyWalkResult(static_cast(-0x1.04bb900000000p+0),567514), + GreedyWalkResult(static_cast(-0x1.142a3dfffffffp+0),550771), + GreedyWalkResult(static_cast(-0x1.2f1ca40000000p-5),552153), + GreedyWalkResult(static_cast(-0x1.1def580000000p-1),679881), + GreedyWalkResult(static_cast(-0x1.072ac60000000p-4), 29163), + GreedyWalkResult(static_cast(-0x1.2821940000000p-4),854962), + GreedyWalkResult(static_cast(-0x1.72a68e0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.cafce80000000p-3),729852), + GreedyWalkResult(static_cast(-0x1.3ba2d80000000p-2),729021), + GreedyWalkResult(static_cast(-0x1.68739e0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.aeb25c0000000p-1),134880), + GreedyWalkResult(static_cast(-0x1.18c0840000000p-5),693842), + GreedyWalkResult(static_cast(-0x1.fe21ce0000001p-1), 40323), + GreedyWalkResult(static_cast(-0x1.b41fb00000001p-1),735181), + GreedyWalkResult(static_cast(-0x1.2826320000000p-8),379502), + GreedyWalkResult(static_cast(-0x1.5eecda0000000p-1),925333), + GreedyWalkResult(static_cast(-0x1.b002d40000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.4e53aa0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.a1b49bfffffffp-2),228514), + GreedyWalkResult(static_cast(-0x1.f1c7ac0000000p-1),750819), + GreedyWalkResult(static_cast(-0x1.67f6720000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.31a6600000001p-6),341861), + GreedyWalkResult(static_cast(-0x1.61c1080000000p-3),790506), + GreedyWalkResult(static_cast(-0x1.aaa3780000000p-2),550771), + GreedyWalkResult(static_cast(-0x1.3fa68a0000001p-6),160291), + GreedyWalkResult(static_cast(-0x1.38c0b20000000p-1),379199), + GreedyWalkResult(static_cast(-0x1.ee68980000001p-2),318485), + GreedyWalkResult(static_cast(-0x1.dd852c0000001p-2),655315), + GreedyWalkResult(static_cast(-0x1.06fa43fffffffp+0),790506), + GreedyWalkResult(static_cast(-0x1.07007e0000000p+0),926790), + GreedyWalkResult(static_cast(-0x1.f352a1fffffffp-1),523435), + 
GreedyWalkResult(static_cast(-0x1.c6d6160000000p-1),169991), + GreedyWalkResult(static_cast(-0x1.090c620000000p-5),168533), + GreedyWalkResult(static_cast(-0x1.19f6860000000p+0),239334), + GreedyWalkResult(static_cast(-0x1.e3f8580000001p-2),255916), + GreedyWalkResult(static_cast(-0x1.2148180000000p-1),206826), + GreedyWalkResult(static_cast(-0x1.0487660000000p-2),494402), + GreedyWalkResult(static_cast(-0x1.be5ea00000000p-3),532480), + GreedyWalkResult(static_cast(-0x1.114b0a0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.1e0a2a0000000p-7),379350), + GreedyWalkResult(static_cast(-0x1.22f06bfffffffp+0),239334), + GreedyWalkResult(static_cast(-0x1.bc42c20000000p-1),133288), + GreedyWalkResult(static_cast(-0x1.9ec387fffffffp-2),495101), + GreedyWalkResult(static_cast(-0x1.ab66b80000000p-3),115894), + GreedyWalkResult(static_cast(-0x1.9be6e80000000p-4),513240), + GreedyWalkResult(static_cast(-0x1.4cdc7ffffffffp-6),973080), + GreedyWalkResult(static_cast(-0x1.c7a31c0000000p-7),764589), + GreedyWalkResult(static_cast(-0x1.a35f1c0000000p-8),115043), + GreedyWalkResult(static_cast(-0x1.3422a00000000p-1),228514), + GreedyWalkResult(static_cast(-0x1.5a4aa60000000p-4), 49557), + GreedyWalkResult(static_cast(-0x1.06eddc0000000p-2),226268), + GreedyWalkResult(static_cast(-0x1.d46bde0000000p-1),790506), + GreedyWalkResult(static_cast(-0x1.02e72c0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.e33abffffffffp-2),112248), + GreedyWalkResult(static_cast(-0x1.ae74060000001p-4),133288), + GreedyWalkResult(static_cast(-0x1.272a2bfffffffp-7),850826), + GreedyWalkResult(static_cast(-0x1.357f25fffffffp-2),239334), + GreedyWalkResult(static_cast(-0x1.33c9f1fffffffp-3), 25893), + GreedyWalkResult(static_cast(-0x1.771fdc0000001p-5),305162), + GreedyWalkResult(static_cast(-0x1.18a1080000000p-4),729175), + GreedyWalkResult(static_cast(-0x1.46ad1e0000000p-4),790506), + GreedyWalkResult(static_cast(-0x1.0a53300000000p-1),294738), + 
GreedyWalkResult(static_cast(-0x1.783f4e0000000p-6),546811), + GreedyWalkResult(static_cast(-0x1.3f05b60000000p-3),239334), + GreedyWalkResult(static_cast(-0x1.602d5c0000000p-3),463324), + GreedyWalkResult(static_cast(-0x1.c8f2b20000000p-5),513240), + GreedyWalkResult(static_cast(-0x1.0bde920000000p+0),236872), + GreedyWalkResult(static_cast(-0x1.8eb3fe0000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.d981120000002p-3),849285), + GreedyWalkResult(static_cast(-0x1.d8151a0000001p-1),133288), + GreedyWalkResult(static_cast(-0x1.c231ec0000000p-1),790506), + GreedyWalkResult(static_cast(-0x1.c742700000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.2a6c6a0000000p+0),945767), + GreedyWalkResult(static_cast(-0x1.5b8c5bfffffffp-2),294738), + GreedyWalkResult(static_cast(-0x1.391a700000000p-12),562015), + GreedyWalkResult(static_cast(-0x1.896b960000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.28e7fe0000000p-3),228514), + GreedyWalkResult(static_cast(-0x1.577a11fffffffp-4),348136), + GreedyWalkResult(static_cast(-0x1.43b7f80000000p-4),950108), + GreedyWalkResult(static_cast(-0x1.7e64600000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.97ebe20000000p-5),392823), + GreedyWalkResult(static_cast(-0x1.a856440000000p-3),793084), + GreedyWalkResult(static_cast(-0x1.84531a0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.7c80d40000000p-4),186838), + GreedyWalkResult(static_cast(-0x1.0c56e5fffffffp+0),294738), + GreedyWalkResult(static_cast(-0x1.72c0da0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.1844d00000000p-5),606365), + GreedyWalkResult(static_cast(-0x1.52a5d40000000p-10),470059), + GreedyWalkResult(static_cast(-0x1.7d31400000000p-1),738101), + GreedyWalkResult(static_cast(-0x1.c47df00000000p-7),710471), + GreedyWalkResult(static_cast(-0x1.dc3ccbfffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.5e773c0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.7ffd660000000p-2),920345), + 
GreedyWalkResult(static_cast(-0x1.ab0dc00000001p-2),677155), + GreedyWalkResult(static_cast(-0x1.7f8db00000000p-5),973080), + GreedyWalkResult(static_cast(-0x1.add3b60000000p-1),293302), + GreedyWalkResult(static_cast(-0x1.e0328c0000000p-4),758625), + GreedyWalkResult(static_cast(-0x1.6022ce0000000p-5),666073), + GreedyWalkResult(static_cast(-0x1.a1d241fffffffp-4),226268), + GreedyWalkResult(static_cast(-0x1.cec5e60000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.893f260000000p-3),855760), + GreedyWalkResult(static_cast(-0x1.0790c00000000p-2),145893), + GreedyWalkResult(static_cast(-0x1.49456ffffffffp-7),215955), + GreedyWalkResult(static_cast(-0x1.71b1bc0000001p-5),312088), + GreedyWalkResult(static_cast(-0x1.8b1c580000000p-1),729175), + GreedyWalkResult(static_cast(-0x1.2010d20000000p-4),142436), + GreedyWalkResult(static_cast(-0x1.c33ecc0000000p-4),280878), + GreedyWalkResult(static_cast(-0x1.6b1dce0000000p-2),444780), + GreedyWalkResult(static_cast(-0x1.f76bb60000001p-2),294738), + GreedyWalkResult(static_cast(-0x1.87151ffffffffp-2),294738), + GreedyWalkResult(static_cast(-0x1.f522ae0000000p-2), 9333), + GreedyWalkResult(static_cast(-0x1.77d5c40000001p-4),114191), + GreedyWalkResult(static_cast(-0x1.f7f4edfffffffp-5),239334), + GreedyWalkResult(static_cast(-0x1.1c46b00000000p-1),270226), + GreedyWalkResult(static_cast(-0x1.a4f43bfffffffp-6),140906), + GreedyWalkResult(static_cast(-0x1.8952480000000p-1),670146), + GreedyWalkResult(static_cast(-0x1.ca891c0000000p-7),973080), + GreedyWalkResult(static_cast(-0x1.e36b85fffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.1aaf580000000p-3),909372), + GreedyWalkResult(static_cast(-0x1.8116920000000p-8), 51434), + GreedyWalkResult(static_cast(-0x1.acc07e0000000p-1), 26012), + GreedyWalkResult(static_cast(-0x1.a2316c0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.3a68660000000p-3),628152), + GreedyWalkResult(static_cast(-0x1.c199e80000000p-2),907223), + 
GreedyWalkResult(static_cast(-0x1.8bfc920000000p-3), 16476), + GreedyWalkResult(static_cast(-0x1.c9b8520000000p-5),568921), + GreedyWalkResult(static_cast(-0x1.be82e20000000p-2),134880), + GreedyWalkResult(static_cast(-0x1.8cabe60000001p-2),660609), + GreedyWalkResult(static_cast(-0x1.7222980000000p-1),118809), + GreedyWalkResult(static_cast(-0x1.b313ea0000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.8b56380000000p-7), 38538), + GreedyWalkResult(static_cast(-0x1.3e74440000000p-3),729175), + GreedyWalkResult(static_cast(-0x1.6349900000000p-9),136557), + GreedyWalkResult(static_cast(-0x1.2128060000001p+0),672634), + GreedyWalkResult(static_cast(-0x1.25d0560000001p-8),314066), + GreedyWalkResult(static_cast(-0x1.206c1a0000000p+0),288181), + GreedyWalkResult(static_cast(-0x1.696a200000001p-3),114191), + GreedyWalkResult(static_cast(-0x1.1a74180000000p-1),226268), + GreedyWalkResult(static_cast(-0x1.608e8a0000000p-2),239334), + GreedyWalkResult(static_cast(-0x1.e583780000001p-1),854962), + GreedyWalkResult(static_cast(-0x1.cdfae5fffffffp-1),288181), + GreedyWalkResult(static_cast(-0x1.53c3200000001p-5),926790), + GreedyWalkResult(static_cast(-0x1.a8f37bfffffffp-5),164698), + GreedyWalkResult(static_cast(-0x1.e1399ffffffffp-7),517512), + GreedyWalkResult(static_cast(-0x1.adf8240000000p-3),587902), + GreedyWalkResult(static_cast(-0x1.f91ca60000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.b717880000000p-3), 70417), + GreedyWalkResult(static_cast(-0x1.57b0760000000p-4),939764), + GreedyWalkResult(static_cast(-0x1.1de1ca0000000p+0), 74899), + GreedyWalkResult(static_cast(-0x1.c67da40000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.64c96c0000001p-2),261103), + GreedyWalkResult(static_cast(-0x1.54c6240000000p-1),107308), + GreedyWalkResult(static_cast(-0x1.0274f60000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.5b05140000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.1a4ca80000000p-6),950108), + 
GreedyWalkResult(static_cast(-0x1.de24900000000p-1),836318), + GreedyWalkResult(static_cast(-0x1.5c834e0000000p-3),228059), + GreedyWalkResult(static_cast(-0x1.682d5c0000000p-3),107308), + GreedyWalkResult(static_cast(-0x1.b96de80000000p-1),532480), + GreedyWalkResult(static_cast(-0x1.f1c5680000000p-1),186838), + GreedyWalkResult(static_cast(-0x1.d87015fffffffp-3),236872), + GreedyWalkResult(static_cast(-0x1.992d1ffffffffp-2),884850), + GreedyWalkResult(static_cast(-0x1.38d1580000001p-6),986292), + GreedyWalkResult(static_cast(-0x1.a59a700000001p-3),550771), + GreedyWalkResult(static_cast(-0x1.bb07fe0000001p-5),531816), + GreedyWalkResult(static_cast(-0x1.48fa060000000p-1),128603), + GreedyWalkResult(static_cast(-0x1.81b2000000001p-7),129055), + GreedyWalkResult(static_cast(-0x1.4bfc5bfffffffp-2),576030), + GreedyWalkResult(static_cast(-0x1.4683200000000p-1),727476), + GreedyWalkResult(static_cast(-0x1.9165800000000p-5), 38538), + GreedyWalkResult(static_cast(-0x1.2b59be0000000p-3),941181), + GreedyWalkResult(static_cast(-0x1.21086e0000000p-5),467026), + GreedyWalkResult(static_cast(-0x1.1fb5700000000p-7),986292), + GreedyWalkResult(static_cast(-0x1.3fa0620000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.d2b8bdfffffffp-2),355312), + GreedyWalkResult(static_cast(-0x1.ec8a43fffffffp-2),532480), + GreedyWalkResult(static_cast(-0x1.eeaace0000000p-9),385014), + GreedyWalkResult(static_cast(-0x1.5649140000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.49e3ae0000001p-6), 29163), + GreedyWalkResult(static_cast(-0x1.b53db20000001p-5),442413), + GreedyWalkResult(static_cast(-0x1.5aa6380000000p-3),909721), + GreedyWalkResult(static_cast(-0x1.cdc0f80000000p-3),450479), + GreedyWalkResult(static_cast(-0x1.c9aab80000000p-2),541408), + GreedyWalkResult(static_cast(-0x1.0d78740000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.1a48820000000p-6),810043), + GreedyWalkResult(static_cast(-0x1.3a76fc0000000p-1),804725), + 
GreedyWalkResult(static_cast(-0x1.2f318a0000000p-7),562579), + GreedyWalkResult(static_cast(-0x1.6c91920000000p-2),270226), + GreedyWalkResult(static_cast(-0x1.9ac5940000000p-4),263560), + GreedyWalkResult(static_cast(-0x1.42bc8c0000000p-1),112754), + GreedyWalkResult(static_cast(-0x1.906b7c0000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.3586ac0000000p-7), 53791), + GreedyWalkResult(static_cast(-0x1.69ef5a0000000p-3),385014), + GreedyWalkResult(static_cast(-0x1.4e4f3e0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.b379440000000p-1),980037), + GreedyWalkResult(static_cast(-0x1.1a94380000000p+0),624004), + GreedyWalkResult(static_cast(-0x1.5e22e00000001p-8), 36331), + GreedyWalkResult(static_cast(-0x1.919a7c0000000p-1),883883), + GreedyWalkResult(static_cast(-0x1.0313ea0000000p+0),117555), + GreedyWalkResult(static_cast(-0x1.8781320000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.8504900000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.2e79740000000p-2),827608), + GreedyWalkResult(static_cast(-0x1.91ac000000000p-5),355549), + GreedyWalkResult(static_cast(-0x1.e0b6b80000000p-6),973080), + GreedyWalkResult(static_cast(-0x1.ae8bd00000000p-1), 26012), + GreedyWalkResult(static_cast(-0x1.edd4cc0000001p-5),587902), + GreedyWalkResult(static_cast(-0x1.1191160000000p-6),750819), + GreedyWalkResult(static_cast(-0x1.3c69140000000p-2),192244), + GreedyWalkResult(static_cast(-0x1.30a7540000000p+0),804725), + GreedyWalkResult(static_cast(-0x1.77bda40000002p-5),654035), + GreedyWalkResult(static_cast(-0x1.f0496e0000001p-1), 2251), + GreedyWalkResult(static_cast(-0x1.788009fffffffp-4),439426), + GreedyWalkResult(static_cast(-0x1.3527f9fffffffp+0),354262), + GreedyWalkResult(static_cast(-0x1.1914b20000000p+0), 16476), + GreedyWalkResult(static_cast(-0x1.4b03460000000p-4),648421), + GreedyWalkResult(static_cast(-0x1.25ae300000000p-1),292300), + GreedyWalkResult(static_cast(-0x1.cd467c0000000p-6), 47898), + 
GreedyWalkResult(static_cast(-0x1.e082960000001p-3),169790), + GreedyWalkResult(static_cast(-0x1.38970e0000000p-5),495101), + GreedyWalkResult(static_cast(-0x1.d88693fffffffp-2),348136), + GreedyWalkResult(static_cast(-0x1.13046c0000000p-1),439129), + GreedyWalkResult(static_cast(-0x1.ed2e720000001p-4),749981), + GreedyWalkResult(static_cast(-0x1.b162180000000p-5),864388), + GreedyWalkResult(static_cast(-0x1.458a1a0000000p-2),121683), + GreedyWalkResult(static_cast(-0x1.ffddf40000000p-6), 82234), + GreedyWalkResult(static_cast(-0x1.c99b320000001p-6),495323), + GreedyWalkResult(static_cast(-0x1.aa13de0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.36671e0000000p-4),236872), + GreedyWalkResult(static_cast(-0x1.276aaa0000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.41718e0000000p-6),973080), + GreedyWalkResult(static_cast(-0x1.39280c0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.8156020000000p-6),854497), + GreedyWalkResult(static_cast(-0x1.075a840000000p+0),930775), + GreedyWalkResult(static_cast(-0x1.0b01560000000p-1), 52041), + GreedyWalkResult(static_cast(-0x1.fabeec0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.794f3a0000000p-1),384841), + GreedyWalkResult(static_cast(-0x1.d9d54dfffffffp-1),419057), + GreedyWalkResult(static_cast(-0x1.c27da80000000p-2),219992), + GreedyWalkResult(static_cast(-0x1.0d06660000000p-5),563395), + GreedyWalkResult(static_cast(-0x1.7ee86e0000000p-1),348136), + GreedyWalkResult(static_cast(-0x1.a219b9fffffffp-3),969505), + GreedyWalkResult(static_cast(-0x1.434a760000000p-4), 16476), + GreedyWalkResult(static_cast(-0x1.6cf4380000000p-1),677921), + GreedyWalkResult(static_cast(-0x1.94c9c00000000p-6),901398), + GreedyWalkResult(static_cast(-0x1.c625540000000p-5),932100), + GreedyWalkResult(static_cast(-0x1.2309d40000000p-1),677155), + GreedyWalkResult(static_cast(-0x1.3719a60000000p-4),112754), + GreedyWalkResult(static_cast(-0x1.2c1eba0000000p-6),527498), + 
GreedyWalkResult(static_cast(-0x1.affd100000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.09db9c0000000p-2),790506), + GreedyWalkResult(static_cast(-0x1.b991e00000000p-4),535044), + GreedyWalkResult(static_cast(-0x1.2c3aec0000000p-8),938124), + GreedyWalkResult(static_cast(-0x1.cce0d20000000p-1),496356), + GreedyWalkResult(static_cast(-0x1.d80a4a0000000p-8),776790), + GreedyWalkResult(static_cast(-0x1.b3f6ec0000000p-1),749772), + GreedyWalkResult(static_cast(-0x1.d370f60000000p-1),441230), + GreedyWalkResult(static_cast(-0x1.17859e0000000p+0), 12009), + GreedyWalkResult(static_cast(-0x1.552dde0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.1e56f40000000p+0), 56131), + GreedyWalkResult(static_cast(-0x1.5b74140000000p-4),186084), + GreedyWalkResult(static_cast(-0x1.2bc8580000000p+0),870888), + GreedyWalkResult(static_cast(-0x1.03ba840000000p+0),385014), + GreedyWalkResult(static_cast(-0x1.9e8ea80000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.9181880000000p-6),517512), + GreedyWalkResult(static_cast(-0x1.fd3e6a0000000p-3),255668), + GreedyWalkResult(static_cast(-0x1.d793e5fffffffp-6),511753), + GreedyWalkResult(static_cast(-0x1.335bf00000000p-6),679881), + GreedyWalkResult(static_cast(-0x1.98bd340000000p-1), 56131), + GreedyWalkResult(static_cast(-0x1.37253c0000000p-3),337863), + GreedyWalkResult(static_cast(-0x1.55a79e0000000p-2),270226), + GreedyWalkResult(static_cast(-0x1.f2ead00000001p-3),430269), + GreedyWalkResult(static_cast(-0x1.f45e060000002p-3),226356), + GreedyWalkResult(static_cast(-0x1.c435d60000001p-9), 81654), + GreedyWalkResult(static_cast(-0x1.1ea9580000000p+0),550771), + GreedyWalkResult(static_cast(-0x1.cc1a520000000p-2),444956), + GreedyWalkResult(static_cast(-0x1.9428000000000p-2),914163), + GreedyWalkResult(static_cast(-0x1.8f2a440000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.077cdc0000000p+0),582680), + GreedyWalkResult(static_cast(-0x1.31819c0000000p-3),292300), + 
GreedyWalkResult(static_cast(-0x1.5ae2840000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.0f86240000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.e4b8040000000p-2), 5217), + GreedyWalkResult(static_cast(-0x1.92a3020000000p-6),866106), + GreedyWalkResult(static_cast(-0x1.4c2bd40000000p-3),560074), + GreedyWalkResult(static_cast(-0x1.96bfae0000000p-2),225945), + GreedyWalkResult(static_cast(-0x1.7cfb9a0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.809e320000001p-5),890893), + GreedyWalkResult(static_cast(-0x1.1156de0000000p-1),313671), + GreedyWalkResult(static_cast(-0x1.eb64960000000p-1), 23136), + GreedyWalkResult(static_cast(-0x1.5a97fa0000000p-2),228059), + GreedyWalkResult(static_cast(-0x1.2f87c20000001p-1),945767), + GreedyWalkResult(static_cast(-0x1.45a1460000000p-4), 29348), + GreedyWalkResult(static_cast(-0x1.ddef220000001p-3),580161), + GreedyWalkResult(static_cast(-0x1.0b9e120000000p-5),179074), + GreedyWalkResult(static_cast(-0x1.f977160000001p-4),141149), + GreedyWalkResult(static_cast(-0x1.b366bc0000000p-1),660609), + GreedyWalkResult(static_cast(-0x1.7009520000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.08adbe0000000p-3),550091), + GreedyWalkResult(static_cast(-0x1.c989580000000p-4),168533), + GreedyWalkResult(static_cast(-0x1.56433a0000000p-1),672634), + GreedyWalkResult(static_cast(-0x1.dbe0b00000000p-5),667763), + GreedyWalkResult(static_cast(-0x1.11c0620000000p+0),294738), + GreedyWalkResult(static_cast(-0x1.d6d5560000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.899de00000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.fc2835fffffffp-4),550771), + GreedyWalkResult(static_cast(-0x1.28141e0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.abb32c0000000p-1),134340), + GreedyWalkResult(static_cast(-0x1.2c2b640000001p-3),926855), + GreedyWalkResult(static_cast(-0x1.3447780000000p-3), 47688), + GreedyWalkResult(static_cast(-0x1.5fb8300000000p-6),226268), + 
GreedyWalkResult(static_cast(-0x1.73cba7fffffffp-4), 40323), + GreedyWalkResult(static_cast(-0x1.b99f040000000p-1), 16476), + GreedyWalkResult(static_cast(-0x1.6b9ba60000000p-1),112754), + GreedyWalkResult(static_cast(-0x1.d3aa360000000p-1),192244), + GreedyWalkResult(static_cast(-0x1.25282a0000000p+0),275023), + GreedyWalkResult(static_cast(-0x1.16c09a0000000p-5), 56131), + GreedyWalkResult(static_cast(-0x1.bdd6720000000p-3),667763), + GreedyWalkResult(static_cast(-0x1.7421400000000p-1),587902), + GreedyWalkResult(static_cast(-0x1.dfa079fffffffp-9),630231), + GreedyWalkResult(static_cast(-0x1.debb760000001p-2),778627), + GreedyWalkResult(static_cast(-0x1.3589be0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.a659d00000000p-3),353498), + GreedyWalkResult(static_cast(-0x1.9f913bfffffffp-4),936836), + GreedyWalkResult(static_cast(-0x1.3b78740000000p-3),504419), + GreedyWalkResult(static_cast(-0x1.42611c0000000p-3),107308), + GreedyWalkResult(static_cast(-0x1.4e66860000000p-6),439809), + GreedyWalkResult(static_cast(-0x1.4a79000000000p-1),513240), + GreedyWalkResult(static_cast(-0x1.41902a0000000p+0),774981), + GreedyWalkResult(static_cast(-0x1.4850a60000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.a7bf000000000p-1),236872), + GreedyWalkResult(static_cast(-0x1.9d67d60000001p-5),517512), + GreedyWalkResult(static_cast(-0x1.c908860000000p-2),854962), + GreedyWalkResult(static_cast(-0x1.63e9520000000p-2),513240), + GreedyWalkResult(static_cast(-0x1.e423200000000p-5),295526), + GreedyWalkResult(static_cast(-0x1.91894ffffffffp-2),476414), + GreedyWalkResult(static_cast(-0x1.29ba4a0000000p-4),774219), + GreedyWalkResult(static_cast(-0x1.a577500000000p-1),582680), + GreedyWalkResult(static_cast(-0x1.de39c80000000p-2),909721), + GreedyWalkResult(static_cast(-0x1.f75ad40000001p-1),385014), + GreedyWalkResult(static_cast(-0x1.93794a0000000p-1),750819), + GreedyWalkResult(static_cast(-0x1.5f65ec0000000p-3),294738), + 
GreedyWalkResult(static_cast(-0x1.23f7820000000p-6),786537), + GreedyWalkResult(static_cast(-0x1.a4f01e0000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.218c620000000p-12),134340), + GreedyWalkResult(static_cast(-0x1.33a59e0000000p-1), 40323), + GreedyWalkResult(static_cast(-0x1.c9920c0000000p-2),523435), + GreedyWalkResult(static_cast(-0x1.18be840000000p-2),865184), + GreedyWalkResult(static_cast(-0x1.0442d60000000p-1),729175), + GreedyWalkResult(static_cast(-0x1.047e940000000p+0),255668), + GreedyWalkResult(static_cast(-0x1.0d97ac0000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.2a5e4e0000001p-4),660609), + GreedyWalkResult(static_cast(-0x1.f4887bfffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.a8d50c0000001p-2),531816), + GreedyWalkResult(static_cast(-0x1.8e5e300000001p-4),541780), + GreedyWalkResult(static_cast(-0x1.06e1a40000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.2e98940000000p-5),385014), + GreedyWalkResult(static_cast(-0x1.1d7bb60000000p-4),320041), + GreedyWalkResult(static_cast(-0x1.93514a0000000p-6), 38538), + GreedyWalkResult(static_cast(-0x1.fe2429fffffffp-2),292300), + GreedyWalkResult(static_cast(-0x1.161f500000000p-6), 38538), + GreedyWalkResult(static_cast(-0x1.3d90900000000p-6),318039), + GreedyWalkResult(static_cast(-0x1.01c5040000000p-2),532480), + GreedyWalkResult(static_cast(-0x1.4f30960000000p-4),223261), + GreedyWalkResult(static_cast(-0x1.8a9b3c0000000p-4),382537), + GreedyWalkResult(static_cast(-0x1.02d07a0000000p-4),790506), + GreedyWalkResult(static_cast(-0x1.9527260000001p-2),294738), + GreedyWalkResult(static_cast(-0x1.047eea0000000p-1),886263), + GreedyWalkResult(static_cast(-0x1.d0deba0000000p-1),278930), + GreedyWalkResult(static_cast(-0x1.5c2d320000000p-1),236872), + GreedyWalkResult(static_cast(-0x1.f1670a0000000p-8),580161), + GreedyWalkResult(static_cast(-0x1.1426ce0000000p-3),550771), + GreedyWalkResult(static_cast(-0x1.b5f0ee0000000p-5),517512), + 
GreedyWalkResult(static_cast(-0x1.efd5180000000p-6),696486), + GreedyWalkResult(static_cast(-0x1.f1b0440000000p-6),118809), + GreedyWalkResult(static_cast(-0x1.28d45c0000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.f18c5e0000000p-1),184077), + GreedyWalkResult(static_cast(-0x1.50e1320000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.fb43600000000p-2),467026), + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp new file mode 100644 index 000000000..ec4a799d7 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp @@ -0,0 +1,9 @@ +#pragma once +#include +#include +namespace ipnsw { + using GreedyWalkResult = std::pair; + extern std::vector GREEDY_WALK_RESULTS; + static constexpr int GWR_DIST = 0; + static constexpr int GWR_VERT = 1; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GroupData.hpp b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp new file mode 100644 index 000000000..b9052ab23 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp @@ -0,0 +1,10 @@ +#include +namespace ipnsw { + struct GroupData { + hb_mc_eva_t seen_mem; + hb_mc_eva_t candidates_mem; + hb_mc_eva_t results_mem; + hb_mc_eva_t curr; + hb_mc_eva_t n_results; + }; +}; diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp new file mode 100644 index 000000000..52f0bad5b --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp @@ -0,0 +1,255 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ipnsw { + //using graph_tools::Graph; + //using graph_tools::Graph500Data; + + class Parser { + public: + using OptionTable = std::map; + + Parser(){} + + void parse(int argc, char *argv[]) { + int pos = 0; + int arg = 0; + + while (arg < argc) { + std::string argstr = std::string(argv[arg]); + if (ipnsw::startswith(argstr, "--")) { + // optional argument + if (++arg >= argc) 
{ + throw std::runtime_error("'" + argstr + "' requries an argument"); + } + _options[argstr] = std::string(argv[arg]); + + } else { + // positional argument + switch (pos++) { + case 0: + _exe = argstr; + break; + + case 1: + _ucode = argstr; + break; + + case 2: + _version = argstr; + break; + + case 3: + _data = argstr; + break; + + case 4: + _queries = argstr; + break; + + case 5: + case 6: + case 7: + case 8: + _graphs.push_back(argstr); + break; + + default: + break; + } + } + arg++; + }; + + // _exe = std::string(argv[0]); + // _ucode = std::string(argv[1]); + // _version = std::string(argv[2]); + // _data = std::string(argv[3]); + // _queries = std::string(argv[4]); + // // graphs + // for (int i = 5; i < argc; ++i) { + // _graphs.push_back(std::string(argv[i])); + // } + } + + std::string str() const { + std::stringstream ss; + ss << "ucode: " << _ucode << "\n" + << "version: " << _version << "\n" + << "exe: " << _exe << "\n" + << "data: " << _data << "\n" + << "queries: " << _queries << "\n"; + + for (int i = 0; i < _graphs.size(); ++i) { + ss << "graph " << i << ": " << _graphs[i] << "\n"; + } + + return ss.str(); + } + + std::string option(const std::string &opt) const { + auto it = _options.find(opt); + if (it != _options.end()) + return it->second; + + return ""; + } + + std::vector do_queries() const { + std::string do_queries_str = option("--queries"); + if (do_queries_str.empty()) { + return {}; + } + + std::vector _do_queries; + size_t pos = 0; + size_t at = 0; + + while ((at = do_queries_str.find(",", pos)) != std::string::npos) { + do_queries_str.replace(at, 1, " "); + pos = at+1; + } + + std::stringstream ss(do_queries_str); + while (ss.good()) { + int q; + ss >> q; + _do_queries.push_back(q); + } + + return _do_queries; + } + + int num_iproducts() const { + int n = 100; + auto s = option("--num-iproducts"); + if (!s.empty()) { + n = from_string(s); + } + return n; + } + + int grid_x() const { + auto s = option("--grid-x"); + if (!s.empty()) + 
return from_string(s); + else + return 1; + } + + int grid_y() const { + auto s = option("--grid-y"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + int grp_x() const { + auto s = option("--group-x"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + int grp_y() const { + auto s = option("--group-y"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + std::string ucode() const { return _ucode; } + std::string version() const { return _version; } + std::string exe() const { return _exe; } + std::vector graphs() const { return _graphs; } + std::string graph(int i) const { return _graphs[i]; } + std::string data() const { return _data; } + std::string queries() const { return _queries; } + + std::string _ucode; + std::string _version; + std::string _exe; + std::vector _graphs; + std::string _data; + std::string _queries; + OptionTable _options; + }; + + class IO { + public: + IO() {} + IO(const Parser &p): _parser(p) {} + + + graph_tools::Graph graph(int i) { + std::cout << "Reading graph " << i << ": " + << _parser._graphs[i] << std::endl; + + graph_tools::Graph500Data d = graph_tools::Graph500Data::FromASCIIFile(_parser._graphs[i]); + return graph_tools::Graph::FromGraph500Data(d); + } + + std::vector graphs() { + std::vector graphs; + for (int i = 0; i < _parser._graphs.size(); ++i) + graphs.push_back(graph(i)); + + return graphs; + } + + template + std::vector read(const std::string & fname) { + int r; + struct stat st; + + std::cerr << "Opening " << fname << std::endl; + + r = stat(fname.c_str(), &st); + if (r != 0) { + auto s = fname + ": " + std::string(strerror(errno)); + throw std::runtime_error(s); + } + std::vector v(st.st_size/sizeof(T)); + + FILE *f = fopen(fname.c_str(), "rb"); + if (!f) { + auto s = fname + ": " + std::string(strerror(errno)); + throw std::runtime_error(s); + } + + fread(&v[0], st.st_size, 1, f); + fclose(f); + return v; + } + + template + std::vector> + database() { + using 
array = std::array; + return read(_parser._data); + } + + template + std::vector> + queries() { + using array = std::array; + return read(_parser._queries); + } + + std::string ucode() const { return _parser._ucode; } + std::vector do_queries() const { return _parser.do_queries(); } + + Parser _parser; + }; + +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp new file mode 100644 index 000000000..55e410789 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWResultReader.hpp" +namespace ipnsw { + class IPNSWFactory { + public: + std::unique_ptr KernelRunner()const { + return std::unique_ptr(_KernelRunner()); + } + std::unique_ptr ResultReader()const { + return std::unique_ptr(_ResultReader()); + } + protected: + virtual IPNSWKernelRunner* _KernelRunner()const = 0; + virtual IPNSWResultReader* _ResultReader()const = 0; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp new file mode 100644 index 000000000..4f942db01 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp @@ -0,0 +1,69 @@ +#pragma once +#include +#include +#include +#include + +namespace ipnsw { + class Graph { + public: + Graph() : Graph(graph_tools::Graph()) {} + Graph(const graph_tools::Graph &g) : _graph(g) {} + Graph(graph_tools::Graph &&g) : _graph(g) {} + + void initialize_on_device() { + using hammerblade::host::HammerBlade; + HammerBlade::Ptr hb = HammerBlade::Get(); + + auto & offsets = _graph.get_offsets(); + auto & neighbors = _graph.get_neighbors(); + + _offsets = hb->alloc(offsets.size() * sizeof(offsets[0])); + _neighbors = hb->alloc(neighbors.size() * sizeof(neighbors[0])); + + hb->push_write(_offsets, &offsets[0], offsets.size() * sizeof(offsets[0])); + hb->push_write(_neighbors, &neighbors[0], neighbors.size() * sizeof(neighbors[0])); + 
} + + graph_tools::Graph & graph() { return _graph; } + const graph_tools::Graph & graph() const { return _graph; } + hb_mc_eva_t offsets() const { return _offsets; } + hb_mc_eva_t neighbors() const { return _neighbors; } + + static hb_mc_eva_t InitializeMetadataOnDevice(const std::vector & Gs) { + using hammerblade::host::HammerBlade; + HammerBlade::Ptr hb = HammerBlade::Get(); + struct metadata { + hb_mc_eva_t offset; + hb_mc_eva_t neighbors; + int V; + int E; + }; + + std::vector metad; + for (auto & g : Gs) { + std::cout << "Host: offset = " << std::hex << g.offsets() << " neighbors = " << g.neighbors() << std::endl; + std::cout << std::dec; + metadata m = { + .offset = g.offsets(), + .neighbors = g.neighbors(), + g.graph().num_nodes(), + g.graph().num_edges() + }; + metad.push_back(m); + } + + hb_mc_eva_t metadata = hb->alloc(sizeof(struct metadata) * metad.size()); + hb->push_write(metadata, &metad[0], sizeof(struct metadata) * metad.size()); + hb->sync_write(); + + return metadata; + } + + private: + graph_tools::Graph _graph; + + hb_mc_eva_t _offsets; + hb_mc_eva_t _neighbors; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp new file mode 100644 index 000000000..1604cb93e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp @@ -0,0 +1,40 @@ +#pragma once +#include "HammerBlade.hpp" +#include +#include +namespace ipnsw { + class IPNSWRunner; // forward declaration + + class IPNSWKernelRunner { + public: + using HammerBlade = hammerblade::host::HammerBlade; + using Dim = hammerblade::host::Dim; + IPNSWKernelRunner(){} + + protected: + virtual std::string kernelName(const IPNSWRunner & runner) const =0; + virtual std::vector argv(const IPNSWRunner & runner) const =0; + + public: + virtual Dim gd(const IPNSWRunner &runner) const { + return Dim(1,1); + } + virtual Dim tgd(const IPNSWRunner &runner) const { + return Dim(1,1); + } + + public: + virtual void 
beforeLaunchKernel(const IPNSWRunner &runner) { } + virtual void afterLaunchKernel(const IPNSWRunner &runner) { } + + void runKernel(IPNSWRunner &runner) { + HammerBlade::Ptr hb = HammerBlade::Get(); + hb->push_jobv(gd(runner), + tgd(runner), + kernelName(runner), + argv(runner)); + hb->exec(); + } + }; + +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp new file mode 100644 index 000000000..19eaff181 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp @@ -0,0 +1,13 @@ +#pragma once +#include "HammerBlade.hpp" +namespace ipnsw { + class IPNSWRunner; + + class IPNSWResultReader { + protected: + using HammerBlade = hammerblade::host::HammerBlade; + + public: + virtual void readResults(const IPNSWRunner & runner) {} + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp new file mode 100644 index 000000000..feebf121d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp @@ -0,0 +1,339 @@ +#pragma once +#include "IO.hpp" +#include "HammerBlade.hpp" +#include "IPNSWGraph.hpp" +#include "IPNSWFactory.hpp" +#include "IPNSWKernelRunner.hpp" +#include "IPNSWResultReader.hpp" +#include "GreedyWalkResults.hpp" +#include "GroupData.hpp" +#include + +namespace ipnsw { + + class IPNSWRunnerConfig { + public: + typedef enum { + Dense, + BitVector, + Sparse, + } SetType; + + IPNSWRunnerConfig(): + _set_type(BitVector), + _grid_x(1), + _grid_y(1), + _grp_x(1), + _grp_y(1) { + } + + SetType set_type() const { return _set_type; } + SetType & set_type() { return _set_type; } + + std::string set_type_str() const { + switch (set_type()) { + case Dense: + return "Dense"; + case BitVector: + return "Dense Bit Vector"; + case Sparse: + return "Sparse"; + } + } + + int & grid_x() { return _grid_x; } + int grid_x() const { return _grid_x; } + int & grid_y() { return _grid_y; } + int grid_y() const { return 
_grid_y; } + + int & grp_x() { return _grp_x; } + int grp_x() const { return _grp_x; } + int & grp_y() { return _grp_y; } + int grp_y() const { return _grp_y; } + + private: + SetType _set_type; + int _grid_x; + int _grid_y; + int _grp_x; + int _grp_y; + }; + + class IPNSWRunner { + public: + //static constexpr int QUERY = 276; // fewest dot products for greedy walk + //static constexpr int QUERY = 472; // fewest dot products for beam search + //static constexpr int QUERY = 427; + //static constexpr int QUERY = 355; + //static constexpr int QUERY = 2; + static constexpr int QUERY = 188; + //static constexpr int QUERY = 229; + //static constexpr int QUERY = 490; + //static constexpr int QUERY = 16; + //static constexpr int QUERY = 461; + //static constexpr int QUERY = 470; + + + static constexpr size_t CANDIDATES_MAX = 513; + static constexpr size_t RESULTS_MAX = 129; + + using HammerBlade = hammerblade::host::HammerBlade; + using Dim = hammerblade::host::Dim; + + IPNSWRunner(const Parser &p, + std::unique_ptr & fact) : + IPNSWRunner(p, fact, IPNSWRunnerConfig()) { + } + + IPNSWRunner(const Parser &p, + std::unique_ptr & fact, + const IPNSWRunnerConfig &cfg): + _factory(std::move(fact)), + _cfg(cfg) { + _io = std::unique_ptr(new IO(p)); + _hb = HammerBlade::Get(); + _kernel_runner = _factory->KernelRunner(); + _result_reader = _factory->ResultReader(); + } + + virtual ~IPNSWRunner() { delete _hb; } + + void readInput() { + auto graphs = _io->graphs(); + _graphs = { + Graph(std::move(graphs[3])), + Graph(std::move(graphs[2])), + Graph(std::move(graphs[1])), + Graph(std::move(graphs[0])) + }; + + _db = _io->database(); + _queries = _io->queries(); + } + + void loadProgram() { + _hb->load_application(ucodePath()); + } + + void initializeDeviceMemoryDB() { + std::cout << "Initializing database " << std::endl; + _db_dev = _hb->alloc(_db.size() * sizeof(_db[0])); + _hb->push_write(_db_dev, &_db[0], _db.size() * sizeof(_db[0])); + } + + void initializeDeviceMemoryQuery() { 
+ std::cout << "Initializing query " << std::endl; + + std::vector do_queries = _io->do_queries(); + if (do_queries.empty()) { + do_queries = {QUERY}; + } + + _query_dev = _hb->alloc(sizeof(_queries[0]) * do_queries.size()); + + for (hb_mc_eva_t qidx = 0; qidx < do_queries.size(); ++qidx) { + int query = do_queries[qidx]; + _hb->push_write(_query_dev + qidx * sizeof(_queries[query]), + &_queries[query], + sizeof(_queries[query])); + } + } + + size_t seen_dev_size_per_group() const { + size_t size, words; + switch (_cfg.set_type()) { + case IPNSWRunnerConfig::Dense: + case IPNSWRunnerConfig::Sparse: + return _db.size() * sizeof(int); + case IPNSWRunnerConfig::BitVector: + words = _db.size()/32; + if (_db.size() % 32 != 0) + words += 1; + return words * sizeof(int); + } + } + void initializeDeviceMemorySeen() { + std::cout << "Initializing seen set " << std::endl; + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(seen_dev_size_per_group()); + _seen_dev.push_back(dev); + } + } + + void initializeDeviceMemoryGraphs() { + for (auto & graph : _graphs) + graph.initialize_on_device(); + + _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs); + } + + void initializeDeviceVCurrDCurr() { + _curr_dev = _hb->alloc(sizeof(GreedyWalkResult) * numGroups()); + hb_mc_eva_t grp = 0; + std::cout << std::hex; + std::cout << "_curr_dev=" << std::hex << _curr_dev << std::endl; + std::cout << " curr(" << std::dec << grp << ")=" << std::hex << curr_dev(grp) << std::endl; + std::cout << "v_curr(" << std::dec << grp << ")=" << std::hex << v_curr_dev(grp) << std::endl; + std::cout << "d_curr(" << std::dec << grp << ")=" << std::hex << d_curr_dev(grp) << std::endl; + std::cout << std::dec; + } + + size_t candidates_dev_size_per_group() const { + return sizeof(GreedyWalkResult) * CANDIDATES_MAX; + } + + void initializeDeviceCandidateDev() { + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(candidates_dev_size_per_group()); + 
_candidates_dev.push_back(dev); + } + } + + size_t results_dev_size_per_group() const { + return sizeof(GreedyWalkResult) * RESULTS_MAX; + } + + void initializeDeviceResultsDev() { + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(results_dev_size_per_group()); + _results_dev.push_back(dev); + } + } + + void initializeDeviceNResultsDev() { + _n_results_dev = _hb->alloc(sizeof(int) * numGroups()); + } + + void initializeGroupData() { + _group_data_dev = _hb->alloc(sizeof(GroupData) * numGroups()); + for (int i = 0; i < numGroups(); ++i) { + GroupData gd = { + .seen_mem = seen_dev(i), + .candidates_mem = candidates_dev(i), + .results_mem = results_dev(i), + .curr = curr_dev(i), + .n_results = n_results_dev(i), + }; + _hb->push_write(group_data_dev(i), &gd, sizeof(gd)); + } + } + + void initializeDeviceMemory() { + initializeDeviceMemoryDB(); + initializeDeviceMemoryQuery(); + initializeDeviceMemorySeen(); + initializeDeviceMemoryGraphs(); + initializeDeviceVCurrDCurr(); + initializeDeviceCandidateDev(); + initializeDeviceResultsDev(); + initializeDeviceNResultsDev(); + initializeGroupData(); + } + + void runKernel() { + _kernel_runner->beforeLaunchKernel(*this); + // sync + std::cout << "Starting DMA" << std::endl; + _hb->sync_rw(); + std::cout << "Launching kernel" << std::endl; + _kernel_runner->runKernel(*this); + _kernel_runner->afterLaunchKernel(*this); + } + + void readResults() { + _result_reader->readResults(*this); + + } + + void run() { + readInput(); + loadProgram(); + initializeDeviceMemory(); + runKernel(); + readResults(); + } + + ///////////// + // Getters // + ///////////// + std::string ucodePath() const { + return _io->ucode(); + } + + hb_mc_eva_t db_dev() const { return _db_dev; } + hb_mc_eva_t query_dev(hb_mc_eva_t qidx) const { + return _query_dev + qidx * sizeof(_queries[qidx]); + } + + hb_mc_eva_t seen_dev(hb_mc_eva_t grp) const { + return _seen_dev[grp]; + } + + hb_mc_eva_t curr_dev(hb_mc_eva_t grp = 0) const { + return 
_curr_dev + (grp*sizeof(GreedyWalkResult)); + } + + hb_mc_eva_t v_curr_dev(hb_mc_eva_t grp) const { + return curr_dev(grp) + sizeof(float); + } + hb_mc_eva_t d_curr_dev(hb_mc_eva_t grp) const { + return curr_dev(grp); + } + + hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; } + + hb_mc_eva_t candidates_dev(hb_mc_eva_t grp) const { + return _candidates_dev[grp]; + } + + hb_mc_eva_t results_dev(hb_mc_eva_t grp) const { + return _results_dev[grp]; + } + + hb_mc_eva_t n_results_dev(hb_mc_eva_t grp) const { + return _n_results_dev + grp * sizeof(int); + } + + hb_mc_eva_t group_data_dev(hb_mc_eva_t grp) const { + return _group_data_dev + grp * sizeof(GroupData); + } + + int numGroups() const { return _kernel_runner->gd(*this).x() * _kernel_runner->gd(*this).y(); } + + const std::vector> & db() const { return _db; } + + const IPNSWRunnerConfig & cfg() const { return _cfg; } + ///////////// + // Setters // + ///////////// + + private: + IPNSWRunnerConfig _cfg; + + public: + std::unique_ptr _io; + + private: + std::vector _graphs; + std::vector> _db; + std::vector> _queries; + std::vector _group_data; + HammerBlade::Ptr _hb; + + // device pointers + hb_mc_eva_t _db_dev; + hb_mc_eva_t _query_dev; + std::vector _seen_dev; + hb_mc_eva_t _curr_dev; + hb_mc_eva_t _graph_metadata_dev; + std::vector _candidates_dev; + std::vector _results_dev; + hb_mc_eva_t _n_results_dev; + hb_mc_eva_t _group_data_dev; + + // composites + std::unique_ptr _kernel_runner; + std::unique_ptr _result_reader; + std::unique_ptr _factory; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp new file mode 100644 index 000000000..ff0468903 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +namespace ipnsw { + class IProductUBmkFactory 
: public IPNSWFactory { + public: + IProductUBmkFactory(int iterations = 10): + _iterations(iterations) { + } + + protected: + virtual IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkKernelRunner(_iterations); } + virtual IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; } + + int _iterations; + }; +} + diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp new file mode 100644 index 000000000..1ee4da763 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" + +namespace ipnsw { + class IProductUBmkKernelRunner : public IPNSWKernelRunner { + public: + IProductUBmkKernelRunner(int iterations = 10) : + IPNSWKernelRunner(), + _iterations(iterations) { + } + + private: + std::string kernelName(const IPNSWRunner & runner) const { + return "inner_product_ubmk"; + } + + virtual std::vector argv(const IPNSWRunner & runner) const { + std::vector argv = { + runner.db_dev(), // database + runner.query_dev(0), // query + static_cast(_iterations), // number of inner products + }; + return argv; + } + + protected: + int _iterations; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp new file mode 100644 index 000000000..964cc2d8e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp @@ -0,0 +1,20 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +#include "IProductUBmkFactory.hpp" +#include "IProductUBmkParallelKernelRunner.hpp" + +namespace ipnsw { + class IProductUBmkParallelFactory : public IProductUBmkFactory { + public: + IProductUBmkParallelFactory(int itertions = 10): + IProductUBmkFactory(itertions) { + } + + 
private: + IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkParallelKernelRunner(_iterations); } + + }; +} + diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp new file mode 100644 index 000000000..668114fb2 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp @@ -0,0 +1,64 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IPNSWRunner.hpp" +#include "HammerBlade.hpp" +#include + +namespace ipnsw { + class IProductUBmkParallelKernelRunner : public IProductUBmkKernelRunner { + public: + IProductUBmkParallelKernelRunner(int iterations = 10) : + IProductUBmkKernelRunner(iterations) { + } + + private: + using HammerBlade = hammerblade::host::HammerBlade; + + void beforeLaunchKernel(const IPNSWRunner &runner) { + HammerBlade::Ptr _hb = HammerBlade::Get(); + + _visit.clear(); + + for (int i = 0; i < _iterations * runner.numGroups(); ++i) { + _visit.push_back((i*3) % runner.db().size()); + } + std::random_shuffle(_visit.begin(), _visit.end()); + + _visit_dev = _hb->alloc(sizeof(int) * _visit.size()); + + std::cout << "beforeLaunchKernel called: _visit_dev = " << std::hex << _visit_dev << std::endl; + std::cout << std::dec; + + _hb->push_write(_visit_dev, &_visit[0], sizeof(int) * _visit.size()); + } + + std::vector argv(const IPNSWRunner & runner) const { + std::cout << "Called" << std::endl; + std::vector argv = { + runner.db_dev(), // database + runner.query_dev(0), // query + static_cast(_iterations), // number of inner products + _visit_dev, // vectors to visit + }; + return argv; + } + + void afterLaunchKernel(const IPNSWRunner &runner) { + HammerBlade::Ptr _hb = HammerBlade::Get(); + _hb->free(_visit_dev); + _visit.clear(); + } + + virtual Dim gd(const IPNSWRunner &runner) const { + return Dim(runner.cfg().grid_x(),runner.cfg().grid_y()); + } + + virtual 
Dim tgd(const IPNSWRunner &runner) const { + return Dim(runner.cfg().grp_x(),runner.cfg().grp_y()); + } + + hb_mc_eva_t _visit_dev; + std::vector _visit; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp new file mode 100644 index 000000000..300990b18 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" + +namespace ipnsw { + class IProductUBmkResultReader : public IPNSWResultReader { + public: + void readResults(const IPNSWRunner & runner) { + std::cout << "Done" << std::endl; + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile new file mode 100644 index 000000000..a8f6da2c5 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -0,0 +1,184 @@ +##################### +# Standard includes # +##################### +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) +include $(REPLICANT_PATH)/environment.mk + +all: + +################## +# Prepare inputs # +################## +ipnsw-eval-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw +ipnsw-inputs = $(ipnsw-eval-dir)/data/database_music100.bin +ipnsw-inputs += $(ipnsw-eval-dir)/data/query_music100.bin +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_0 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_1 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_2 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_3 + +ipnsw-input := $(ipnsw-eval-dir)/data/database_music100.bin +# this rule generates all the inputs, but we just target one +# to avoid running this more than once +$(ipnsw-input): + cd $(ipnsw-eval-dir) && bash prep.sh + +####################################### +# Base clase run directory generation # +####################################### +# $1 = name +# $2 = version 
+# $3 = args (written as a "C_ARGS += ..." line into the generated run/<name>/Makefile)
+define run-dir
+run/$1/kernel.cpp: kernel/$2/kernel.cpp
+	@mkdir -p $$(dir $$@)
+	@cp $$< $$@
+	@echo "MAKING $$@"
+
+run/$1/Makefile: template.mk
+	@mkdir -p $$(dir $$@)
+	@cat $$< > $$@
+	@echo "C_ARGS += $3" >> $$@
+	@echo "MAKING $$@"
+
+.PHONY: generate-$1 build-$1 purge-$1 exec-$1 profile-$1 debug-$1 saifgen-$1
+
+generate-$1: run/$1/Makefile run/$1/kernel.cpp
+
+purge-$1:
+	rm -rf run/$1
+# Each target below re-runs make inside the generated run directory; the goal
+build-$1: generate-$1
+	+$(MAKE) -C run/$1 main.riscv
+# must be a separate word after the -C directory, not part of its path.
+exec-$1: generate-$1
+	+$(MAKE) -C run/$1 main.exec.log
+
+profile-$1: generate-$1
+	+$(MAKE) -C run/$1 main.profile.log
+
+debug-$1: generate-$1
+	+$(MAKE) -C run/$1 main.debug.log
+
+saifgen-$1: generate-$1
+	+$(MAKE) -C run/$1 main.saifgen.log
+endef
+
+#################################
+# Common command line arguments #
+#################################
+C_ARGS += $(ipnsw-inputs)
+
+###############
+# Greedy Walk #
+###############
+# greedy-walk version -> dimensions
+# inner product with ipc=0.3 (8x4)
+greedy_walk-grp-x := 1
+greedy_walk-grp-y := 1
+# inner product with ipc=0.43 (8x4)
+greedy_walk_v1-grp-x := 1
+greedy_walk_v1-grp-y := 1
+# inner product with FLOPS/cycle=0.2 (8x4)
+greedy_walk_v2-grp-x := 1
+greedy_walk_v2-grp-y := 1
+# inner product with FLOPS/cycle=0.26 (8x4)
+greedy_walk_v3-grp-x := 1
+greedy_walk_v3-grp-y := 1
+# inner product v4-serial
+greedy_walk_v3-ipv4serial-grp-x := 1
+greedy_walk_v3-ipv4serial-grp-y := 1
+# greedy_walk_v3 + ParallelInnerProduct_v1
+greedy_walk_v4-grp-x := 2
+greedy_walk_v4-grp-y := 2
+
+###############
+# Beam Search #
+###############
+# beam-search version -> dimensions
+
+# very slow - uses a very dumb sparse set
+beam_search-grp-x := 1
+beam_search-grp-y := 1
+# dense set - inner product with ipc=0.3 (8x4)
+beam_search_v1-grp-x := 1
+beam_search_v1-grp-y := 1
+# dense set - inner product with ipc=0.43 (8x4)
+beam_search_v2-grp-x := 1
+beam_search_v2-grp-y := 1
+# + inner_product_v2 (flops/cycle=0.2039) (8x4)
+beam_search_v3-grp-x := 1
+beam_search_v3-grp-y := 1
+# +
inner_product_v3 (flops/cycle=0.2663) (8x4)
+beam_search_v4-grp-x := 1
+beam_search_v4-grp-y := 1
+# + Bit vector for dense set
+beam_search_v5-grp-x := 1
+beam_search_v5-grp-y := 1
+# + Bit vector for dense set + inner product v4 serial
+beam_search_v5-ipv4serial-grp-x := 1
+beam_search_v5-ipv4serial-grp-y := 1
+# beam_search_v5 + inner_product_parallel_v3
+beam_search_v6-grp-x := 2
+beam_search_v6-grp-y := 2
+# beam_search_v6 but with 1x2 tile group
+beam_search_v7-grp-x := 1
+beam_search_v7-grp-y := 2
+# beam_search_v5 but edges of candidates traversed in parallel
+beam_search_v8-grp-x := 4
+beam_search_v8-grp-y := 4
+# combination of beam_search_v8 + beam_search_v6
+beam_search_v9-grp-x := 4
+beam_search_v9-grp-y := 4
+# beam_search_v5 but edges of candidates traversed in parallel
+beam_search_v10-grp-x := 8
+beam_search_v10-grp-y := 4
+
+# $1 = version (also selects the <version>-grp-x / <version>-grp-y sizes above)
+# $2 = query (passed as --queries; the run directory is named <version>_query<query>)
+run-name = $(1)_query$(2)
+define run
+$(eval $(call run-dir,$(call run-name,$1,$2),$1,\
+$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call run-name,$1,$2)/kernel.riscv \
+$1 \
+$(C_ARGS) \
+--queries $(2) \
+--group-x $($(1)-grp-x) \
+--group-y $($(1)-grp-y) \
+))
+generate: generate-$(call run-name,$1,$2)
+purge: purge-$(call run-name,$1,$2)
+build: build-$(call run-name,$1,$2)
+exec: exec-$(call run-name,$1,$2)
+profile: profile-$(call run-name,$1,$2)
+debug: debug-$(call run-name,$1,$2)
+saifgen: saifgen-$(call run-name,$1,$2)
+
+saifgen-$(call run-name,$1,$2): $(ipnsw-input)
+profile-$(call run-name,$1,$2): $(ipnsw-input)
+debug-$(call run-name,$1,$2): $(ipnsw-input)
+exec-$(call run-name,$1,$2): $(ipnsw-input)
+
+endef
+.PHONY: generate
+.PHONY: purge
+.PHONY: build
+.PHONY: exec
+.PHONY: profile
+.PHONY: debug
+.PHONY: saifgen
+
+#############################################################
+# Define which queries we want to run and instantiate rules #
+#############################################################
+greedy-walk-queries := 4 16 229 276 461 470 490
+$(foreach q,$(greedy-walk-queries),$(eval $(call run,greedy_walk_v4,$(q))))
+
+beam-search-queries := 2 188 229 355 427 472
+beam-search-queries += 25 74 112 140 148 178
+beam-search-queries += 214 244 278 302 331
+beam-search-queries += 396 420 452 489 511
+$(foreach q,$(beam-search-queries),$(eval $(call run,beam_search_v10,$(q))))
+
+.PHONY: all
+all: exec
diff --git a/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
new file mode 100644
index 000000000..39e09b1a9
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
@@ -0,0 +1,20 @@
+#pragma once
+#include
+#include
+
+namespace ipnsw {
+    // True when st begins with prefix.
+    static bool startswith(const std::string &st, const std::string &prefix) {
+        return st.rfind(prefix, 0) == 0;
+    }
+
+    // Stream-extracts a T from str. The result is value-initialised first so
+    // that a string with nothing to extract yields T{} instead of garbage.
+    template
+    T from_string(const std::string &str) {
+        std::stringstream ss(str);
+        T v{};
+        ss >> v;
+        return v;
+    }
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/graph-tools b/examples/sdh-eval-workloads/ipnsw/graph-tools
new file mode 160000
index 000000000..a7304c67c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/graph-tools
@@ -0,0 +1 @@
+Subproject commit a7304c67c34070877e57719fd183c4a5ee569904
diff --git a/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
new file mode 160000
index 000000000..9a26b6d0c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
@@ -0,0 +1 @@
+Subproject commit 9a26b6d0cbe04a9cc627cce7049a0ba97ca66621
diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
new file mode 160000
index 000000000..f113c0865
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
@@ -0,0 +1 @@
+Subproject commit f113c0865d2d9491551dab8f8b500445b75429bc
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
new file mode 100644
index 000000000..23f2b20d1
--- /dev/null
+++
b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp @@ -0,0 +1,80 @@ +#include "bsg_manycore_regression.h" +#include "ipnsw.hpp" +#include "HammerBlade.hpp" +#include "Graph500Data.hpp" +#include "Graph.hpp" +#include "IO.hpp" +#include "IPNSWGraph.hpp" +#include "IPNSWRunner.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +#include "IProductUBmkFactory.hpp" +#include "IProductUBmkParallelFactory.hpp" +#include "BeamSearchKernelRunner.hpp" +#include "BeamSearchResultReader.hpp" +#include "BeamSearchFactory.hpp" +#include "GreedyWalkKernelRunner.hpp" +#include "GreedyWalkResultReader.hpp" +#include "GreedyWalkFactory.hpp" +#include "GreedyWalkResults.hpp" +#include "StringHelpers.hpp" +#include +#include + +using namespace ipnsw; + +int Main(int argc, char *argv[]) +{ + Parser args; + args.parse(argc, argv); + + std::cout << args.str() << std::endl; + + std::unique_ptr runner; + std::unique_ptr factory; + + IPNSWRunnerConfig cfg; + cfg.grid_x() = args.grid_x(); + cfg.grid_y() = args.grid_y(); + cfg.grp_x() = args.grp_x(); + cfg.grp_y() = args.grp_y(); + + if (ipnsw::startswith(args.version(), "greedy_walk")) { + factory = std::unique_ptr(new GreedyWalkFactory); + } else if (ipnsw::startswith(args.version(), "beam_search")) { + factory = std::unique_ptr(new BeamSearchFactory); + } else if (ipnsw::startswith(args.version(), "iproduct_ubmk")) { + /* parse the number of inner products */ + std::cout << "num inner products " << args.num_iproducts() << std::endl; + int n_iproducts = args.num_iproducts(); + + bool parallel = args.version().find("parallel") != std::string::npos; + if (parallel) { + factory = std::unique_ptr(new IProductUBmkParallelFactory(n_iproducts)); + } else { + factory = std::unique_ptr(new IProductUBmkFactory(n_iproducts)) ; + } + + } else if (args._version == "debug") { + /* just for debugging */ + std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl; + std::cout << "--queries=" << std::endl; + 
std::cout << "--group-x=" << args.grp_x() << std::endl; + std::cout << "--group-y=" << args.grp_y() << std::endl; + auto do_queries = args.do_queries(); + for (auto q : do_queries) { + std::cout << q << " "; + } + std::cout << std::endl; + return 0; + } else { + return 0; + } + + runner = std::unique_ptr(new IPNSWRunner(args, factory, cfg)); + runner->run(); + + return 0; +} + +declare_program_main("IPNSW", Main); diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp new file mode 100644 index 000000000..9c91c72bd --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp @@ -0,0 +1,36 @@ +// Copyright (c) 2019, University of Washington All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, this list +// of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// +// Neither the name of the copyright holder nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp new file mode 100644 index 000000000..a75d5b1bf --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp @@ -0,0 +1,182 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + int ipnsw_beam_search(const graph *Gs, const 
float *database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DynSet> seen(seen_mem, N_V); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); + //v_worst = std::get<1>(results.top()); + bsg_print_int(-v_best); + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + bsg_print_int(dst); + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp new file mode 100644 index 000000000..9ee2ce5e7 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp @@ -0,0 +1,188 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, const float *database, 
const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp new file mode 100644 index 000000000..d55e7e900 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp @@ -0,0 +1,279 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 8 +#define BSG_TILE_GROUP_Y_DIM 4 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + + static constexpr 
int SYNC_INV = -1; + static constexpr int SYNC_DONE = -2; + + void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database, + const float *query, + int *dst_p, + float *distance_p, + int *done_p, + DenseSet_v1 *seen) + { + float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_id]); + int *done = bsg_tile_group_remote_pointer( 0, 0, &done_p[__bsg_id]); + while (true) { + int dst = sleep_until_valid(dst_p, SYNC_INV); + if (dst == SYNC_DONE) + break; + + if (!seen->in(dst)) { + seen->atomic_insert(dst); + //bsg_print_int(dst); + float tmp = distance(query, &database[dst * VSIZE]); + //bsg_print_float(tmp); + *result = tmp; + } else { + *result = -INFINITY; + } + *done = 1; + } + } + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, + int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + int dst_slave = SYNC_INV; + float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + int dist_done [BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + + if (__bsg_id != 0) { + ipnsw_distance_slave(database, q, &dst_slave, dist_result, dist_done, &seen); + } else { + bsg_saif_start(); + int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) + for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) { + dst_slave_ptr[bsg_x_y_to_id(x,y)] + = bsg_tile_group_remote_pointer(x, y, &dst_slave); + dist_result[bsg_x_y_to_id(x,y)] = INFINITY; + dist_done[bsg_x_y_to_id(x,y)] = 0; + } + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif + + // 
initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0; + + // traverse neighbors + for (int dst_i = 0; + dst_i < degree; + dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) { + // read-in work + int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i); + int dst_v[dst_n]; + memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v)); + + // delegate work + int dst; + for (int dst_j = 1; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + *dst_slave_ptr[dst_j] = dst; + } + // work myself + { + dst = dst_v[0]; + if (!seen.in(dst)) { + seen.atomic_insert(dst); + dist_result[0] = distance(q, &database[dst * VSIZE]); + } else { + dist_result[0] = -INFINITY; + } + dist_done[0] = 1; + } + // reduce + for (int dst_j = 0; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + bsg_wait_local_int_asm_blind(&dist_done[dst_j], 1); + dist_done[dst_j] = 0; + float d_neib = dist_result[dst_j]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_float(d_neib); +#endif + // already seen? 
+ if (d_neib == -INFINITY) + continue; + + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + *n_results = n_res; + bsg_saif_end(); + + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp new file mode 100644 index 000000000..b0f374a4c --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v1(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp new file mode 100644 index 000000000..f98216636 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v2(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp new file mode 100644 index 000000000..01f62555f --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+#define distance(v0, v1) \
+    (-1 * inner_product_v3(v0, v1))
+// Beam search over graph layer Gs[G_0], seeded with the vertex/distance read from *v_curr_o / *d_curr_o.
+// Sorts the result heap in results_mem by distance, stores min(results.size(), N_RESULTS) in *n_results, returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSetseen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr = *v_curr_o;
+        float d_curr = *d_curr_o;
+        //bsg_print_int(v_curr);
+        //bsg_print_float(d_curr);
+
+        // initialize priority queues
+        DynHeap, GT> candidates(candidates_mem, 512);
+        DynHeap, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT()); // sort the whole heap: its first n_res slots are not the n_res best
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
new file mode 100644
index 000000000..1ced33c51
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * IPNSW beam search (v5-ipv4serial): 1x1 tile group, distances via inner_product_v4_serial
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1) \
+    (-1 * inner_product_v4_serial(v0, v1))
+// Beam search over graph layer Gs[G_0], seeded with the vertex/distance read from *v_curr_o / *d_curr_o.
+// Sorts the result heap in results_mem by distance, stores min(results.size(), N_RESULTS) in *n_results, returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr = *v_curr_o;
+        float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
+
+        // initialize priority queues
+        DynHeap, GT> candidates(candidates_mem, 512);
+        DynHeap, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
new file mode 100644
index 000000000..7073bb548
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * IPNSW beam search (v5): 1x1 tile group, distances via inner_product_v3
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1) \
+    (-1 * inner_product_v3(v0, v1))
+// Beam search over graph layer Gs[G_0], seeded with the vertex/distance read from *v_curr_o / *d_curr_o.
+// Sorts the result heap in results_mem by distance, stores min(results.size(), N_RESULTS) in *n_results, returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr = *v_curr_o;
+        float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
+
+        // initialize priority queues
+        DynHeap, GT> candidates(candidates_mem, 512);
+        DynHeap, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
new file mode 100644
index 000000000..e88095dea
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
@@ -0,0 +1,195 @@
+/*
+ * IPNSW beam search (v6): 2x2 tile group, tile 0 searches, distances via InnerProductParallel_v1
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1;
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Beam search over Gs[G_0] run by tile 0 only; sorts results_mem, stores min(results.size(), N_RESULTS) in *n_results, returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        if (__bsg_id == 0) {
+
+            // retrieve results from greedy walk
+            int v_curr = *v_curr_o;
+            float d_curr = *d_curr_o;
+            //bsg_print_int(v_curr);
+            //bsg_print_float(d_curr);
+
+            // initialize priority queues
+            DynHeap, GT> candidates(candidates_mem, 512);
+            DynHeap, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(dst);
+#endif
+                    if (!seen.in(dst)) {
+                        // mark as seen
+                        seen.insert(dst);
+                        float d_neib = -1 * ip.inner_product(dst);
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+                }
+
+            }
+
+            //ip.exit();
+
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+            *n_results = n_res;
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
new file mode 100644
index 000000000..37d995573
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
@@ -0,0 +1,194 @@
+/*
+ * IPNSW beam search (v7): 1x2 tile group, distances via InnerProductParallel_v1
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1;
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Beam search over Gs[G_0] using ip.inner_product(); sorts results_mem, stores min(results.size(), N_RESULTS) in *n_results, returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        // retrieve results from greedy walk
+        int v_curr = *v_curr_o;
+        float d_curr = *d_curr_o;
+        //bsg_print_int(v_curr);
+        //bsg_print_float(d_curr);
+
+        // initialize priority queues
+        DynHeap, GT> candidates(candidates_mem, 512);
+        DynHeap, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = -1 * ip.inner_product(dst);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        //ip.exit();
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
new file mode 100644
index 000000000..d87eaf3bd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
@@ -0,0 +1,270 @@
+/*
+ * IPNSW beam search (v8): 2x2 tile group, tile 0 searches and hands neighbor distances to the other tiles
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+// Debug helper: copies each graph descriptor out of Gs (printing it when DEBUG_INPUT_TEST is defined); always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen = %08x\n", seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1) \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+// Work-mailbox sentinels: SYNC_INV means no vertex has been posted, SYNC_DONE tells a worker to leave its loop.
+    static constexpr int SYNC_INV = -1;
+    static constexpr int SYNC_DONE = -2;
+// Worker loop for tiles other than 0: waits for a vertex id in *dst_p and reports its distance, or -INFINITY if already seen.
+    void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database,
+                              const float *query,
+                              int *dst_p,
+                              float *distance_p,
+                              DenseSet_v1 *seen)
+    {
+        float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_id]); // this tile's result slot, addressed through tile (0, 0)
+        while (true) {
+            int dst = sleep_until_valid(dst_p, SYNC_INV);
+            if (dst == SYNC_DONE)
+                break;
+
+            if (!seen->in(dst)) {
+                seen->atomic_insert(dst);
+                //bsg_print_int(dst);
+                float tmp = distance(query, &database[dst * VSIZE]);
+                //bsg_print_float(tmp);
+                *result = tmp;
+            } else {
+                *result = -INFINITY;
+            }
+        }
+    }
+// Beam search over Gs[G_0]: tile 0 drives the search, other tiles run ipnsw_distance_slave; results go to results_mem / *n_results; returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+
+        if (__bsg_id != 0) {
+            ipnsw_distance_slave(database, q, &dst_slave, dist_result, &seen);
+        } else {
+
+            int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+            for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x)
+                for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) {
+                    dst_slave_ptr[bsg_x_y_to_id(x,y)]
+                        = bsg_tile_group_remote_pointer(x, y, &dst_slave);
+                    dist_result[bsg_x_y_to_id(x,y)] = INFINITY;
+                }
+
+            // retrieve results from greedy walk
+            int v_curr = *v_curr_o;
+            float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif
+
+            // initialize priority queues
+            DynHeap, GT> candidates(candidates_mem, 512);
+            DynHeap, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                // traverse neighbors, one batch of up to (tiles in group) vertices at a time
+                for (int dst_i = 0;
+                     dst_i < degree;
+                     dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) {
+                    // read-in work
+                    int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i);
+                    int dst_v[dst_n];
+                    memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                    // delegate work
+                    int dst;
+                    for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+                        *dst_slave_ptr[dst_j] = dst;
+                    }
+                    // work myself
+                    {
+                        dst = dst_v[0];
+                        if (!seen.in(dst)) {
+                            seen.atomic_insert(dst);
+                            dist_result[0] = distance(q, &database[dst * VSIZE]);
+                        } else {
+                            dist_result[0] = -INFINITY;
+                        }
+                    }
+                    // reduce
+                    for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_int(dst);
+#endif
+                        float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_float(d_neib);
+#endif
+                        // already seen?
+                        if (d_neib == -INFINITY)
+                            continue;
+
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+
+                }
+
+            }
+            for (int t = 1; t < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++t) *dst_slave_ptr[t] = SYNC_DONE; // release the workers, as beam_search_v9 does; without this they never leave ipnsw_distance_slave
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+            *n_results = n_res;
+
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
new file mode 100644
index 000000000..69def7bdd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
@@ -0,0 +1,249 @@
+/*
+ * IPNSW beam search (v9): 4x4 tile group, tile (0,0) searches and hands neighbors to the other row-0 tiles
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 4
+#define BSG_TILE_GROUP_Y_DIM 4
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include // NOTE(review): the <...> header names and template arguments in this file were lost in extraction; restore them from the repository
+#include
+#include
+#include
+#include
+#include
+//#include
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V 1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF 128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // adjacency of vertex v is neighbors[offsets[v] .. offsets[v+1]); V vertices, E edges
+    const int *offsets;
+    const int *neighbors;
+    int V;
+    int E;
+};
+
+class LT { // compares (distance, vertex) pairs: lhs distance < rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // compares (distance, vertex) pairs: lhs distance > rhs distance
+public:
+    bool operator()(const std::pair &lhs, const std::pair &rhs) {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+    using InnerProduct = InnerProductParallel_Y;
+// Work-mailbox sentinels: SYNC_INV means no vertex has been posted, SYNC_DONE tells a worker to leave its loop.
+    static constexpr int SYNC_INV = -1;
+    static constexpr int SYNC_DONE = -2;
+// Worker loop for row-0 tiles with __bsg_x != 0: waits for a vertex id in *dst_p and reports its distance, or -INFINITY if already seen.
+    void ipnsw_x_master(bsg_attr_remote const float *__restrict database,
+                        const float *query,
+                        int *dst_p,
+                        float *distance_p,
+                        DenseSet_v1 *seen,
+                        InnerProduct *ip_y)
+    {
+        float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_x]); // this column's result slot, addressed through tile (0, 0)
+        while (true) {
+            int dst = sleep_until_valid(dst_p, SYNC_INV);
+            if (dst == SYNC_DONE)
+                break;
+
+            if (!seen->in(dst)) {
+                seen->atomic_insert(dst);
+                //bsg_print_int(dst);
+                *result = -1.0 * ip_y->inner_product(dst);
+            } else {
+                *result = -INFINITY;
+            }
+        }
+    }
+// Beam search over Gs[G_0]: tile (0,0) drives the search, other row-0 tiles run ipnsw_x_master; results go to results_mem / *n_results; returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair *candidates_mem,
+                          std::pair *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        InnerProduct ip_y(database, q);
+        ip_y.init();
+
+        int dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM];
+
+        if (__bsg_y == 0) {
+            if (__bsg_x == 0) {
+
+                int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM];
+                for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) {
+                    dst_slave_ptr[x] = bsg_tile_group_remote_pointer(x, 0, &dst_slave);
+                    dist_result[x] = INFINITY;
+                }
+
+                // retrieve results from greedy walk
+                int v_curr = *v_curr_o;
+                float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+                bsg_print_int(v_curr);
+                bsg_print_float(d_curr);
+#endif
+
+                // initialize priority queues
+                DynHeap, GT> candidates(candidates_mem, 512);
+                DynHeap, LT> results(results_mem, 128);
+
+                candidates.push({d_curr, v_curr});
+                results.push({d_curr, v_curr});
+
+                float d_worst = d_curr;
+                seen.insert(v_curr);
+
+                while (!candidates.empty()) {
+                    int v_best;
+                    float d_best;
+
+                    auto best = candidates.pop();
+                    v_best = std::get<1>(best);
+                    d_best = std::get<0>(best);
+
+                    d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(-v_best);
+#endif
+
+                    if (d_best > d_worst) {
+                        break;
+                    }
+
+                    // traverse neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+                    int dst_0 = G.offsets[v_best];
+                    int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                    // traverse neighbors, one batch of up to BSG_TILE_GROUP_X_DIM vertices at a time
+                    for (int dst_i = 0;
+                         dst_i < degree;
+                         dst_i += BSG_TILE_GROUP_X_DIM) {
+                        // read-in work
+                        int dst_n = std::min(BSG_TILE_GROUP_X_DIM, degree-dst_i);
+                        int dst_v[dst_n];
+                        memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                        // delegate work
+                        int dst;
+                        for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+                            *dst_slave_ptr[dst_j] = dst;
+                        }
+                        // work myself
+                        {
+                            dst = dst_v[0];
+                            if (!seen.in(dst)) {
+                                seen.atomic_insert(dst);
+                                dist_result[0] = -1.0 * ip_y.inner_product(dst);
+                            } else {
+                                dist_result[0] = -INFINITY;
+                            }
+                        }
+                        // reduce
+                        for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_int(dst);
+#endif
+                            float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_float(d_neib);
+#endif
+                            // already seen?
+                            if (d_neib == -INFINITY)
+                                continue;
+
+                            d_worst = std::get<0>(results.top());
+                            // if there's room for new result or this distance is promising
+                            if ((results.size() < EF) || (d_neib < d_worst)) {
+
+                                // push onto candidates and results
+                                candidates.push({d_neib, dst});
+                                results.push({d_neib, dst});
+
+                                // prune down to recall
+                                if (results.size() > EF)
+                                    results.pop();
+                            }
+                        }
+
+                    }
+
+                }
+
+                // signal all columns done
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM; ++tile)
+                    *dst_slave_ptr[tile] = SYNC_DONE;
+
+                int n_res = std::min(results.size(), N_RESULTS);
+                std::sort(results_mem, results_mem+results.size(), LT()); // whole heap is sorted; the first n_res entries are reported
+                *n_results = n_res;
+
+            } else { // bsg_x != 0
+                ipnsw_x_master(database, q, &dst_slave, dist_result, &seen, &ip_y);
+            }
+        }
+
+        ip_y.exit();
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
new file mode 100644
index 000000000..9c761e94d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp @@ -0,0 +1,4 @@ +// No-op kernel. Returns 0 explicitly: flowing off the end of a non-void function is undefined behaviour. +extern "C" int empty() { + return 0; +} diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp new file mode 100644 index 000000000..385e69d8a --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp @@ -0,0 +1,147 @@ +/* + * IPNSW greedy walk kernel (ipnsw_greedy_search) and an input sanity check (input_test) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + 
bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, const float *database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. 
iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp new file mode 100644 index 000000000..67533d6da --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp @@ -0,0 +1,147 @@ +/* + * IPNSW greedy walk kernel, variant using inner_product_v1 for distances + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v1(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp new file mode 100644 index 000000000..d7c2bd9c3 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp @@ -0,0 +1,147 @@ +/* + * IPNSW greedy walk kernel, variant using inner_product_v2 for distances + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v2(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp new file mode 100644 index 000000000..aafefe6fd --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp @@ -0,0 +1,147 @@ +/* + * IPNSW greedy walk kernel, variant using inner_product_v4_serial for distances + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, 
&database[v_curr*VSIZE]); + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp new file mode 100644 index 000000000..99614fc8b --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp @@ -0,0 +1,147 @@ +/* + * IPNSW greedy walk kernel, variant using inner_product_v3 for distances + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v3(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp new file mode 100644 index 000000000..c60b3e125 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp @@ -0,0 +1,152 @@ +/* + * IPNSW greedy walk kernel, variant computing distances with InnerProductParallel_v1 + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +using InnerProduct = InnerProductParallel_v1; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +#define DEBUG_GREEDY_VIS_TR + + int ipnsw_greedy_search (const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + InnerProduct ip(database, q); + ip.init(); + if (__bsg_id == 0) { + bsg_saif_start(); + int v_curr = 
V_ENTRY; + float d_curr = 0; + + d_curr = -1.0 * ip.inner_product(v_curr); + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = -1.0 * ip.inner_product(dst); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + bsg_saif_end(); + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp new file mode 100644 index 000000000..12da1aebb --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp @@ -0,0 +1,113 @@ +/* + * Annotated copy of the greedy walk kernel carrying the line-count markers that loc.sh sums + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +using InnerProduct = InnerProductParallel_v1; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +#define DEBUG_GREEDY_VIS_TR + + /**/ + int ipnsw_greedy_search (const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + /* loc:2 */ + /**/ + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + /* loc:2 */ + + /* init code - can be hidden by library*/ + InnerProduct ip(database, q); + ip.init(); + if (__bsg_id == 0) { + bsg_saif_start(); + /**/ + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = -1.0 * ip.inner_product(v_curr); + + /**/ + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + /* loc:5 */ + // fetch neighbors + /**/ + for (int dst : G.neighbors(v_curr)) { + float d = -1.0 * ip.inner_product(dst); + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + } + } + } + } + /* loc: 10 */ + /**/ + *v_curr_o = v_curr; + *d_curr_o = d_curr; + } + return 0; + } + /* loc: 5 */ + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh new 
file mode 100644 index 000000000..1f12e76ba --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh @@ -0,0 +1 @@ +cat kernel.loc.cpp | grep loc: | cut -d: -f2 | cut -d* -f1 | awk 'BEGIN{x=0}{x = x+$1}END{print x}' diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp new file mode 100644 index 000000000..aaaf5317d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp @@ -0,0 +1,49 @@ +#pragma once +#include +#include + +// Fixed-capacity binary heap over storage supplied by the caller (N elements at data). +// Ordered as by std::push_heap with Comparitor: top() is the element that compares greatest. +template +class DynHeap { +public: + DynHeap(T *data, int N): + _n(0), + _data_N(N), + _data(data){ + } + + // Insert i. When the heap already holds _data_N elements the current top is evicted to + // make room, unless i would itself become the top, in which case i is dropped. + void push(T i) { + if (_n >= _data_N) { + // Evict before storing: storing first would write _data[_data_N], one past the buffer. + if (_n == 0 || Comparitor()(_data[0], i)) return; + pop(); + } + _data[_n++] = i; + std::push_heap(_data, _data+_n, Comparitor()); + } + + // Remove and return the top element; the heap must not be empty. + T pop() { + std::pop_heap(_data, _data+_n--, Comparitor()); + return _data[_n]; + } + + T top() const { + return _data[0]; + } + + bool empty() const { + return _n == 0; + } + + int size() const { + return _n; + } + + int _n; // number of elements currently stored + int _data_N; // capacity of _data + T *_data; // storage supplied by the caller +}; diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp new file mode 100644 index 000000000..95d6b291e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp @@ -0,0 +1,6 @@ +#ifndef __HELLO_WORLD_HPP +#define __HELLO_WORLD_HPP + +#include + +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp new file mode 100644 index 000000000..6099411c2 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp @@ -0,0 +1,319 @@ +#pragma once +#include "bsg_striped_array.hpp" +#include +#include +#include +#include "sleep_until_valid.hpp" + +template +__attribute__((noinline)) +FLOAT_T inner_product(const FLOAT_T *__restrict a, const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i 
< VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r += a[i + j]*b[i + j]; + } + } + return r; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v1(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r += a[i + j]*b[i + j]; + } + } + return r; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v2(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r = fmaf(a[i+j], b[i+j], r); + } + } + return r; +} + + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v3(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r0 = 0.0, r1 = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += 2 * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { + r0 = fmaf(a[i+j+0*BSIZE], b[i+j+0*BSIZE], r0); + r1 = fmaf(a[i+j+1*BSIZE], b[i+j+1*BSIZE], r1); + } + } + return r0+r1; +} + +template +__attribute__((noinline)) +FLOAT_T inner_product_v4(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0}; + for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + return rs; +} + +template +__attribute__((noinline)) +FLOAT_T inner_product_parallel_v1(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T 
r[UNROLL] = {0.0}; + + for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + + return rs; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v4_serial(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0}; + for (int i = 0; i < VSIZE; i += UNROLL * BSIZE) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + return rs; +} + + +template +FLOAT_T inner_product_parallel_v2( + int id, + const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0.0}; + + for (int i = id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_N) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + + return rs; +} + +template +class InnerProductParallel_v1 { +public: + static constexpr std::size_t VSIZE = 100; + static constexpr std::size_t TG_N = TG_X * TG_Y; + static constexpr int SYNC_DONE = -2; + static constexpr int SYNC_INV = -1; + + InnerProductParallel_v1(bsg_attr_remote const float *t1, const float *t2) { + _inf = INFINITY; + for (int i = 0; i < TG_N; ++i) + _partial[i] = _inf; + + for (int x = 0; x < TG_X; ++x) + for (int y = 0; y < TG_Y; ++y) + _t1_idx_group[bsg_x_y_to_id(x,y)] + = bsg_tile_group_remote_pointer(x,y,&_t1_idx); + + _t1 = t1; + _t2 = t2; + _t1_idx = 
SYNC_INV; + } + + void init() { + if (__bsg_id == 0) { + return; + } + + float p = 0.0; + int t1_idx; + float *partial_result = bsg_tile_group_remote_pointer(0, 0, &_partial[__bsg_id]); + + while (true) { + t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); + if (t1_idx == SYNC_DONE) + break; + + p = inner_product_parallel_v1(_t2, &_t1[t1_idx * VSIZE]); + *partial_result = p; + } + } + + float inner_product(int idx) { + if (__bsg_id != 0) + return 0.0; + + for (int tile = 0; tile < TG_X*TG_Y; ++tile) + *_t1_idx_group[tile] = idx; + + _partial[__bsg_id] = inner_product_parallel_v1(_t2, &_t1[idx * VSIZE]); + + float r = 0.0; + for (int tile = 0; tile +class InnerProductParallel_Y { +public: + static constexpr std::size_t VSIZE = 100; + static constexpr int SYNC_DONE = -2; + static constexpr int SYNC_INV = -1; + + InnerProductParallel_Y(bsg_attr_remote const float *t1, const float *t2) { + _inf = INFINITY; + for (int i = 0; i < TG_Y; ++i) + _partial[i] = _inf; + + for (int y = 0; y < TG_Y; ++y) + _t1_idx_group[y] = bsg_tile_group_remote_pointer(__bsg_x, y, &_t1_idx); + + _t1 = t1; + _t2 = t2; + _t1_idx = SYNC_INV; + } + + void init() { + if (__bsg_y == 0) { + return; + } + + float p = 0.0; + int t1_idx; + float *partial_result = bsg_tile_group_remote_pointer(__bsg_x, 0, &_partial[__bsg_y]); + + while (true) { + t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); + if (t1_idx == SYNC_DONE) + break; + + p = inner_product_parallel_v2(__bsg_y, _t2, &_t1[t1_idx * VSIZE]); + *partial_result = p; + } + } + + float inner_product(int idx) { + if (__bsg_y != 0) + return 0.0; + + for (int tile = 0; tile < TG_Y; ++tile) + *_t1_idx_group[tile] = idx; + + _partial[__bsg_y] = inner_product_parallel_v2(__bsg_y, _t2, &_t1[idx * VSIZE]); + + float r = 0.0; + for (int tile = 0; tile +#include +template +class DynSet { +public: + DynSet(T *data, int N): + _data(data), + _data_N(N), + _n(0) { + } + + void insert(T i) { + _data[_n++] = i; + std::sort(_data, _data+_n, Comparitor()); + } + + bool 
in(T i) { + return std::binary_search(_data, _data+_n, i, Comparitor()); + } + + int size() const { + return _n; + } + + T *_data; + int _n; + int _data_N; +}; + +template +class DenseSet { +public: + DenseSet(int *data): + _data(data) { + } + + void insert(T i) { + _data[i] = 1; + } + + void atomic_insert(T i) { + insert(i); + } + + bool in(T i) { + return _data[i] == 1; + } + + int *_data; +}; + +template +class DenseSet_v1 { +public: + DenseSet_v1(int *data) : + _data(data){ + } + + void insert(T i) { + _data[word(i)] |= (1 << bit(i)); + } + + void atomic_insert(T i) { + int *ptr = &_data[word(i)]; + int r = 1 << bit(i); + asm volatile ("amoor.w x0, %[r], %[ptr]" : + : + [r] "r" (r), + [ptr] "m" (*ptr)); + return; + } + + bool in(T i) { + return _data[word(i)] & (1 << bit(i)); + } + + int word(T i) const { + return i >> 5; + } + + int bit(T i) const { + return i & 31; + } + int *_data; +}; diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp new file mode 100644 index 000000000..d59088d75 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp @@ -0,0 +1,28 @@ +#pragma once +#include +template +static inline T sleep_on_update(volatile T *ptr) +{ + T r; + asm volatile ("lr.w.aq %[r], %[ptr]" : + [r] "=r" (r) : + [ptr] "m" (*ptr) + ); + return r; +} + +template +static inline T sleep_until_valid(volatile T *ptr, T not_valid) +{ + T r; + + asm volatile ("lr.w %[r], %[ptr]" : + [r] "=r" (r) : + [ptr] "m" (*ptr)); + + while (r == not_valid) { + r = sleep_on_update(ptr); + } + *ptr = not_valid; + return r; +} diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp new file mode 100644 index 000000000..9fe605f3a --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp @@ -0,0 
+1,180 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +//#define DEBUG_SLAVE +//#define DEBUG_MASTER + +using barrier = bsg_barrier; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_parallel_v1(x,y) + +#define SYNC_DONE -1 + + __attribute__((noinline)) + int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all, + barrier *group_barrier, + std::atomic *kp, + std::atomic *rp) + { + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + // pre-compute addresses on remote tiles + std::atomic *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + std::atomic *rp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + + for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) { + for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) { + kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, 
kp); + rp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, rp); + } + } + + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + + // set k on all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + kp_group[tile]->store(k, std::memory_order_relaxed); + + // do inner product + group_barrier->sync(); // signal ready + float r_local = iproduct(query, &database[k * VSIZE]); +#ifdef DEBUG_MASTER + bsg_print_float(r_local); +#endif + rp_group[__bsg_id]->store(r_local, std::memory_order_relaxed); + group_barrier->sync(); // signal done + + // read r from all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) { + float r_remote = rp_group[tile]->load(std::memory_order_relaxed); +#ifdef DEBUG_MASTER + bsg_print_float(r_remote); +#endif + r += r_remote; + } + } + } + + return (int)r; + } + + __attribute__((noinline)) + void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + barrier *group_barrier, + std::atomic *kp, + std::atomic *rp) + { + float r = 0.0; + int k; + + while (true) { + // load next + group_barrier->sync(); // signal ready + k = kp->load(std::memory_order_relaxed); + if (k == SYNC_DONE) + break; + + // do inner product + r = iproduct(query, &database[k * VSIZE]); + rp->store(r, std::memory_order_relaxed); +#ifdef DEBUG_SLAVE + bsg_print_float(r); +#endif + group_barrier->sync(); // signal done + } + } + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + static barrier group_barrier; + static std::atomic k; + static std::atomic r; + float rr; + + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + if (__bsg_id == 
0) { + // enter master loop + rr = inner_product_ubmk_master(database, q, N, visit_remote_all, + &group_barrier, &k, &r); + } else { + // enter slave loop + inner_product_ubmk_slave(database, q, &group_barrier, &k, &r); + } + bsg_cuda_print_stat_end(0); + + return (int)(rr); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp new file mode 100644 index 000000000..df8be2dae --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp @@ -0,0 +1,154 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" +#include "sleep_until_valid.hpp" + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_MASTER +//#define DEBUG_SLAVE +#define iproduct(x,y) \ + inner_product_parallel_v1(x,y) + + #define SYNC_INV -2 + #define SYNC_DONE -1 + + __attribute__((noinline)) + int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all, + int *kp, + float *rp) + { + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + // pre-compute addresses on 
remote tiles + int *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + + for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) { + for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) { + kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp); + } + } + + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + + // set k on all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + *kp_group[tile] = k; + + float r_local = iproduct(query, &database[k * VSIZE]); +#ifdef DEBUG_MASTER + bsg_print_float(r_local); +#endif + rp[__bsg_id] = r_local; + + // read r from all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) { + float r_remote = sleep_until_valid(&rp[tile], INFINITY); +#ifdef DEBUG_MASTER + bsg_print_float(r_remote); +#endif + r += r_remote; + } + } + } + + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + *kp_group[tile] = SYNC_DONE; + + return (int)r; + } + + __attribute__((noinline)) + void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int *kp, + float *rp) + { + float r = 0.0; + int k; + + while (true) { + // load next + k = sleep_until_valid(kp, SYNC_INV); + if (k == SYNC_DONE) + break; + + r = iproduct(query, &database[k * VSIZE]); + *rp = r; +#ifdef DEBUG_SLAVE + bsg_print_float(r); +#endif + } + } + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + static int k = SYNC_INV; + static float r [BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM] = {INFINITY}; + float rr; + + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + if (__bsg_id == 0) { + // enter master 
loop + rr = inner_product_ubmk_master(database, q, N, visit_remote_all, &k, r); + } else { + // enter slave loop + inner_product_ubmk_slave(database, q, &k, bsg_tile_group_remote_pointer(0,0, &r[__bsg_id])); + } + bsg_cuda_print_stat_end(0); + + return (int)(rr); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp new file mode 100644 index 000000000..be92dfcc9 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp @@ -0,0 +1,80 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" +#include "sleep_until_valid.hpp" + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +using InnerProduct = InnerProductParallel_v1; +using barrier = bsg_barrier; + +#ifdef __cplusplus +extern "C" { +#endif + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + barrier b; + + bsg_cuda_print_stat_start(0); + + InnerProduct ip(database, q); + ip.init(); + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + for (int i = 0; i < N; i += 
VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + float rp = ip.inner_product(k); + r += rp; + } + } + + ip.exit(); + bsg_cuda_print_stat_end(0); + b.sync(); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp new file mode 100644 index 000000000..fc8dd7c82 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp @@ -0,0 +1,71 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + + int inner_product_ubmk(const float *database, const float *query, int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + const float *b = &database[i*3*VSIZE]; + r += inner_product(q,b); + } + 
bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp new file mode 100644 index 000000000..9d9dcbc11 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp @@ -0,0 +1,94 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v3(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + float q[VSIZE]; + float r = 0; + int visit[VISIT_BUFSIZE]; + //int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id_x * __bsg_tile_group_id_y]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + //int *visit_remote = &visit_remote_all[0]; + + bsg_print_int(-1 * __bsg_tile_group_id); + bsg_print_int(N); + 
bsg_print_hexadecimal(reinterpret_cast(database)); + bsg_print_hexadecimal(reinterpret_cast(query)); + bsg_print_hexadecimal(reinterpret_cast(visit_remote_all)); + + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + int k = visit[j]; + //r += iproduct(q, &database[(i+j*3)*VSIZE]); + r += iproduct(q, &database[k*VSIZE]); + } + } + bsg_cuda_print_stat_end(0); + + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp new file mode 100644 index 000000000..2deb68437 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v1(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp new file mode 100644 index 000000000..0d4fce43b --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v2(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp new file mode 100644 index 000000000..8f1058017 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v3(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp new file mode 100644 index 000000000..c1ab7a9ba --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v4(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/template.mk b/examples/sdh-eval-workloads/ipnsw/template.mk new file mode 100644 index 000000000..13c1e5919 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/template.mk @@ -0,0 +1,72 @@ +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) +include $(REPLICANT_PATH)/environment.mk +include $(BSG_MACHINE_PATH)/Makefile.machine.include + +# kernel code +BSG_MANYCORE_KERNELS = kernel.riscv + +RISCV_CCPPFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/kernel/include +RISCV_CCPPFLAGS += -Dbsg_tiles_X=1 +RISCV_CCPPFLAGS += -Dbsg_tiles_Y=1 + +RISCV_TARGET_OBJECTS = kernel.rvo +kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) +RISCV_OPT_LEVEL = -O3 +include $(EXAMPLES_PATH)/cuda/riscv.mk 
+RISCV_LDFLAGS := $(filter-out -nostdlib,$(RISCV_LDFLAGS)) + +# host code +graphtools-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools +hammerblade-helpers-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers + +include $(graphtools-dir)/libgraphtools.mk +include $(hammerblade-helpers-dir)/libhammerblade-helpers-host.mk + +# header files +TEST_HEADERS := $(libhammerblade-helpers-host-interface-headers) +TEST_HEADERS += $(libgraphtools-interface-headers) +TEST_HEADERS += GreedyWalkResults.hpp +TEST_HEADERS += IO.hpp +TEST_HEADERS += IPNSWGraph.hpp +TEST_HEADERS += IPNSWRunner.hpp +TEST_HEADERS += IPNSWKernelRunner.hpp +TEST_HEADERS += GreedyWalkKernelRunner.hpp +TEST_HEADERS += BeamSearchKernelRunner.hpp +TEST_HEADERS += IProductUBmkKernelRunner.hpp +TEST_HEADERS += IPNSWResultReader.hpp +TEST_HEADERS += GreedyWalkResultReader.hpp +TEST_HEADERS += BeamSearchResultReader.hpp +TEST_HEADERS += GreedyWalkResults.hpp +TEST_HEADERS += IPNSWFactory.hpp +TEST_HEADERS += GreedyWalkFactory.hpp +TEST_HEADERS += BeamSearchFactory.hpp +TEST_HEADERS += IProductUBmkFactory.hpp +TEST_HEADERS += StringHelpers.hpp + +# source files +TEST_SOURCES := GreedyWalkResults.cpp +TEST_SOURCES += ipnsw.cpp + +# cxxflags +CXXFLAGS += $(libgraphtools-interface-cxxflags) +CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags) +CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw +CXXFLAGS += -DCOSIM + +# ldflags +LDFLAGS += $(libgraphtools-interface-ldflags) +LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags) + +vpath %.cpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw +vpath %.hpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw + +TEST_NAME = main + +include $(EXAMPLES_PATH)/compilation.mk +include $(EXAMPLES_PATH)/link.mk + +# mark dependencies +$(TEST_OBJECTS): $(libgraphtools-interface-libraries) +$(TEST_OBJECTS): $(TEST_HEADERS) + +include $(EXAMPLES_PATH)/execution.mk