diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..f352d419a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,12 @@
+[submodule "examples/sdh-eval-workloads/ipnsw/hb-prog-eval"]
+	path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval
+	url = git@github.com:bespoke-silicon-group/hb-prog-eval
+[submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"]
+	path = examples/sdh-eval-workloads/ipnsw/graph-tools
+	url = git@github.com:bespoke-silicon-group/graph-tools
+[submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"]
+	path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
+	url = git@github.com:bespoke-silicon-group/hammerblade-helpers
+[submodule "examples/graphit/graphit-src"]
+	path = examples/graphit/graphit-src
+	url = git@github.com:bespoke-silicon-group/graphit.git 
diff --git a/examples/Makefile b/examples/Makefile
index 1bc3055e8..6fda8a5ef 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -45,7 +45,7 @@ include $(REPLICANT_PATH)/environment.mk
 include $(EXAMPLES_PATH)/link.mk
 
 # Supported example suites
-TARGETS = library spmd cuda python
+TARGETS = library spmd cuda python graphit
 
 # Define the tests that get run
 TESTS += test_loader
diff --git a/examples/cuda/riscv.mk b/examples/cuda/riscv.mk
index 87a52d511..00b37c1eb 100644
--- a/examples/cuda/riscv.mk
+++ b/examples/cuda/riscv.mk
@@ -244,7 +244,7 @@ RISCV_LDFLAGS += -Wl,--no-check-sections
 # This builds a .riscv binary for the current machine type and tile
 # group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked
 # in the final binary.
-%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
+kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
 	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
 
 kernel.link.clean:
diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile
new file mode 100644
index 000000000..600ef53f4
--- /dev/null
+++ b/examples/graphit/Makefile
@@ -0,0 +1,61 @@
+# Copyright (c) 2019, University of Washington All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+# 
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+# 
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+# CL_DIR: Path to the directory of this AWS F1 Project
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+
+# Defines REGRESSION_PREBUILD
+include $(EXAMPLES_PATH)/link.mk
+
+# Tests to run; each name is a subdirectory of this suite that is entered with $(MAKE) -C
+TESTS += pr_nibble
+# Suite-level regression: reached only after every test's own regression target succeeded
+regression: $(TESTS)
+	@echo "GRAPHIT REGRESSION PASSED"
+# Recurse into the test directory once the shared prebuild step is done
+$(TESTS): $(REGRESSION_PREBUILD)
+	$(MAKE) -C $@ regression
+
+clean: $(TESTS:=.clean)
+# Static pattern rule: phony targets skip implicit-rule search, so <test>.clean needs an explicit rule to be listed in .PHONY
+$(TESTS:=.clean): %.clean:
+	$(MAKE) -C $* clean
+
+.PHONY: clean regression $(TESTS) $(TESTS:=.clean)
diff --git a/examples/graphit/graphit-src b/examples/graphit/graphit-src
new file mode 160000
index 000000000..9f4d8e9ba
--- /dev/null
+++ b/examples/graphit/graphit-src
@@ -0,0 +1 @@
+Subproject commit 9f4d8e9bacac0ed44afe7c3abde697f21457a487
diff --git a/examples/graphit/pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile
new file mode 100644
index 000000000..af6475765
--- /dev/null
+++ b/examples/graphit/pr_nibble/Makefile
@@ -0,0 +1,206 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-src
+
+GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
+# TEST_NAME is the basename of the executable
+TEST_NAME = main
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = pr_nibble
+HOST_TARGET := $(TEST_NAME).exec
+
+BASE_VERSIONS += hybrid
+
+ITERATIONS := 0 1 2 3 4 5 6 7 8 9
+v-from-basev-and-iter = $1-iteration-$2
+basev-from-v          = $(word 1,$(subst -iteration-, ,$1))
+iter-from-v           = $(word 2,$(subst -iteration-, ,$1))
+
+VERSIONS := $(foreach i,$(ITERATIONS),$(foreach v,$(BASE_VERSIONS),\
+        $(call v-from-basev-and-iter,$v,$i)))
+
+VERSION-DIRS := $(foreach v,$(VERSIONS),kernel/$v)
+# Populate kernel/<base>-iteration-<n> from kernel/<base>; copying "src/." keeps a re-run from nesting the source inside an existing copy
+.PHONY: $(VERSION-DIRS)
+$(VERSION-DIRS):
+	mkdir -p $@ && cp -r $(call basev-from-v,$@)/. $@
+# versions creates every per-iteration directory; bleach-versions deletes them again
+.PHONY: versions bleach-versions
+versions: $(VERSION-DIRS)
+bleach-versions:
+	rm -rf $(VERSION-DIRS)
+
+DEFAULT_VERSION := hybrid
+KERNEL_DEFAULT 	:= kernel/$(DEFAULT_VERSION)/kernel.cpp
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.cpp
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
+CDEFINES += 
+CXXDEFINES += 
+
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS) 
+
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+# Specify any header file dependencies
+main.o: INCLUDES += -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS += 
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+BSG_MANYCORE_KERNELS = kernel.riscv 
+
+kernel.rvo: RISCV_CXX = $(RISCV_GXX)
+kernel.riscv: kernel.rvo
+
+%/kernel.rvo: RISCV_CXX = $(RISCV_GXX)
+
+# Tile Group Dimensions
+TILE_GROUP_DIM_X = 16
+TILE_GROUP_DIM_Y = 8
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+RISCV_INCLUDES += -I$(CURRENT_PATH)/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
+
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+%/kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo %/kernel.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT)
+	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+#C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+C_ARGS ?= $(KERNEL_NAME) -g $(GRAPH_PATH)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+
+$(VERSIONS): %: kernel/%/$(HOST_TARGET).log
+
+ALIASES = vanilla_stats.csv vcache_stats.csv dramsim3epoch.json dramsim3.json dramsim3.tag.json dramsim3.txt 
+$(ALIASES): $(HOST_TARGET).log ;
+$(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv 
+	./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@  
+
+# Per-version run: cd into kernel/<version>/, run the host binary there and tee its output to the log named by $@ (file part only, because of the cd)
+KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a)
+.PRECIOUS: $(KERNEL_ALIASES) kernel/%/kernel.riscv
+$(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ;
+kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv
+	$(eval EXEC_PATH   := $(patsubst %/,%,$(dir $@)))
+	$(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH))
+	$(eval _VERSION    := $(notdir $(EXEC_PATH)))
+	cd $(EXEC_PATH) && \
+	$(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \
+		2>&1 | tee $(notdir $@)
+
+.PRECIOUS: %.log
+
+all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log)
+
+###############################################################################
+# Regression Flow
+###############################################################################
+# Pass only if every per-version log reports success (the first prerequisite is a phony target, so $< cannot name a log file)
+regression: versions all-versions
+	@for log in $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log); do grep "BSG REGRESSION TEST .*PASSED.*" $$log > /dev/null || exit 1; done
+
+###############################################################################
+# Default rules, help, and clean
+###############################################################################
+.DEFAULT_GOAL := help
+help:
+	@echo "Usage:"
+	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
+	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
+	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
+	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
+	@echo "      clean: Remove all subdirectory-specific outputs"
+
+print-%  : ; @echo $* = $($*)
+
+# Remove per-version outputs; make expands the suffix list so the recipe does not depend on shell brace expansion
+version.clean:
+	rm -rf $(addprefix kernel/*/*,.ucli .csv .log .rvo .riscv .vpd .key .dis .ll .ll.s)
+
+.PHONY: clean version.clean
+clean: bleach-versions version.clean
+
+
diff --git a/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp
new file mode 100644
index 000000000..294d564a6
--- /dev/null
+++ b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp
@@ -0,0 +1,175 @@
+//#define DEBUG
+#include <bsg_manycore.h>
+
+#ifdef DEBUG
+#define BSG_TILE_GROUP_X_DIM 1 
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
+#else
+#include <bsg_set_tile_x_y.h>
+#endif
+
+#include <bsg_tile_group_barrier.hpp>
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+#include <pr_nibble.hpp>
+#include <cstring>
+
+#ifdef DEBUG
+#define pr_dbg(fmt, ...)            \
+        bsg_printf(fmt, ##__VA_ARGS__)
+#else
+#define pr_dbg(fmt, ...)
+#endif
+
+__attribute__((section(".dram"))) float  * __restrict p;
+__attribute__((section(".dram"))) float  * __restrict old_rank;
+__attribute__((section(".dram"))) float  * __restrict new_rank;
+__attribute__((section(".dram"))) int  * __restrict out_degree;
+
+// Pull traversal: for each destination vertex in the [first, last) range returned by local_range(V, ...), call apply_func(src, dst) for every in-edge whose source is in the frontier.
+template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x)
+{
+    int first, last; // filled in by local_range()
+    local_range(V, &first, &last);
+    for (int dst = first; dst < last; ++dst) {
+        const int row_begin = in_indices[dst];     // CSR-style offsets: in-edges of dst occupy [row_begin, row_end)
+        const int row_end   = in_indices[dst + 1];
+        for (int e = row_begin; e < row_end; ++e) {
+            const int src = in_neighbors[e];
+            if (!from_vertexset[src]) continue; // source is not in the frontier
+            apply_func(src, dst);
+        }
+    }
+    return 0; // E and block_size_x are accepted but not used here
+}
+// Push traversal: for each frontier vertex in the [first, last) range returned by local_range(V, ...), call apply_func(src, dst) for every out-edge.
+// NOTE(review): dst comes straight from out_neighbors, so presumably two tiles can target the same dst; with a plain read-modify-write functor that looks racy - confirm.
+// E and block_size_x are accepted but not used here.
+template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x)
+{
+    int first, last; // filled in by local_range()
+    local_range(V, &first, &last);
+    for (int src = first; src < last; ++src) {
+        if (!from_vertexset[src]) continue; // only frontier vertices push
+        const int row_begin = out_indices[src];     // CSR-style offsets: out-edges of src occupy [row_begin, row_end)
+        const int row_end   = out_indices[src + 1];
+        for (int e = row_begin; e < row_end; ++e) {
+            apply_func(src, out_neighbors[e]);
+        }
+    }
+    return 0;
+}
+
+struct updateEdge // edge functor: adds src's rank contribution to dst
+{
+    void operator() (int src, int dst)
+    {
+        float alpha = 0.15; 
+        new_rank[dst] = (new_rank[dst] + (((((1)    - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src])); // new_rank[dst] += (1-alpha)/(1+alpha) * old_rank[src] / out_degree[src], as a plain (non-atomic) read-modify-write
+    };
+};
+struct updateSelf // vertex functor: folds part of v's old rank into p[v] and clears new_rank[v]
+{
+    void operator() (int v)
+    {
+        float alpha = 0.15; 
+        p[v] = (p[v] + ((((2)  * alpha) / ((1)  + alpha)) * old_rank[v])); // p[v] += 2*alpha/(1+alpha) * old_rank[v]
+        new_rank[v] = (0) ; // cleared here; the edge functor above accumulates into new_rank
+    };
+};
+struct filter_frontier // predicate: true when v's new rank is non-zero and exceeds out_degree[v] * epsilon
+{
+    bool operator() (int v)
+    {
+        float epsilon = (float) 1e-6; 
+        bool output ;
+        if(new_rank[v] == 0) return 0; // a rank of exactly zero never qualifies, whatever the degree
+        output = (new_rank[v]) > ((out_degree[v] * epsilon)); // strict comparison against the degree-scaled threshold
+        return output;
+    };
+};
+// Kernel entry: applies updateSelf to every frontier vertex in the [start, end) range that local_range(V, ...) returns; always returns 0.
+extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) {
+    bsg_cuda_print_stat_start(tag_c); // presumably opens the stats region tagged tag_c - confirm; NOTE(review): issued before the barrier here, after it in the edgeset kernels
+    barrier.sync();
+    int start, end;
+    local_range(V, &start, &end);
+    for (int iter_x = start; iter_x < end; iter_x++) {
+        if(frontier[iter_x]) {updateSelf()(iter_x);} // frontier is used as a 0/1 membership array
+    }
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0;
+}
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { // kernel entry: pull-direction edge update with updateEdge
+    barrier.sync(); // presumably a tile-group barrier (bsg_barrier instance declared at file scope) - confirm
+    bsg_cuda_print_stat_start(tag_c); // stats region tagged tag_c brackets the traversal
+    bsg_saif_start(); // NOTE(review): presumably starts SAIF activity capture - confirm against the runtime
+    edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
+    bsg_saif_end();
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0; // always 0
+}
+// Kernel entry: push-direction edge update; same bracketing (barrier, stats, saif) as the pull entry point, but over the out-edge arrays.
+ extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
+    barrier.sync(); 
+    bsg_cuda_print_stat_start(tag_c); // stats region tagged tag_c brackets the traversal
+    bsg_saif_start(); // NOTE(review): presumably starts SAIF activity capture - confirm against the runtime
+    edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
+    bsg_saif_end();
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0; // always 0
+}
+// Kernel entry: rewrites next5 as a 0/1 array for the next frontier by evaluating filter_frontier on each vertex in the local_range(V, ...) slice; block_size_x is unused.
+extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { 
+    bsg_cuda_print_stat_start(tag_c); // NOTE(review): stats start precedes the barrier here, unlike the edgeset kernels
+    barrier.sync();
+    int start, end;
+    local_range(V, &start, &end);
+    for (int iter_x = start; iter_x < end; iter_x++) {
+        if (iter_x < V) { // guards against an end beyond V; whether local_range can return one is not visible here
+            next5[iter_x] = 0; // default: not in the next frontier
+            if ( filter_frontier()( iter_x ) ) {
+                next5[iter_x] = 1;
+            }
+                }
+        else { break; }
+    } //end of loop
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0;
+}
+// Kernel entry: issues one discarded load (lw into x0) for every 32nd element of the graph and rank arrays, then synchronizes on the barrier.
+extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) {
+        int id = __bsg_id;
+        int threads = bsg_tiles_X * bsg_tiles_Y;
+        // Touch all data: each tile starts at element 32*id and advances by 32*threads.
+        for (int i = 32 * id; i < E; i += 32 * threads) { // NOTE(review): 32 four-byte elements presumably match one cache line - confirm
+                asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i])); // x0 is the RISC-V zero register, so the loaded value is dropped
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i]));
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i]));
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i]));
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (p[i]));
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i]));
+        }
+        for (int i = 32 * id; i < V; i += 32 * threads) {
+                asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i]));
+        }
+        barrier.sync();
+        return ;
+
+}
diff --git a/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp
new file mode 100644
index 000000000..ee50a54d6
--- /dev/null
+++ b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef __PR_PULL_BENCHMARK_HPP
+#define __PR_PULL_BENCHMARK_HPP
+
+#include <math.h>
+#include <local_range.h>
+#include <vertex_struct.h>
+#include <atomics.h>
+#endif
diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp
new file mode 100644
index 000000000..aa2d4032f
--- /dev/null
+++ b/examples/graphit/pr_nibble/main.cpp
@@ -0,0 +1,177 @@
+#include "pr.hpp"
+
+//#define DEBUG
+
+#define VERIFY 0
+
+#ifdef DEBUG
+#define X 1 
+#define Y 1
+#else
+#define X 16 //tile group dim X 
+#define Y 8 // tile group dim Y
+#endif
+
+#define ROOT 6 
+#define NUM_LOCKS 1024 //width of manycore * 64
+
+GraphHB edges; 
+GlobalScalar<hb_mc_eva_t> p_dev;
+GlobalScalar<hb_mc_eva_t> old_rank_dev;
+GlobalScalar<hb_mc_eva_t> new_rank_dev;
+GlobalScalar<hb_mc_eva_t> out_degree_dev;
+
+#include "pr_host.hpp"
+// Host driver: replays pr-nibble on the host up to the iteration named in the kernel path, then runs one more iteration on the device. Returns 0.
+int test_pr_nibble(int argc, char ** argv){
+    InputParser input(argc, argv);
+    if(!input.cmdOptionExists("-g")){
+        std::cerr << "no input args\n";
+        return 0;
+    }
+    std::string ucode_path = input.getRISCVFile();
+    // Parse <n> from an "iteration-<n>" component of the kernel path; a path without one means iteration 0.
+    int iter = 0;
+    std::string iterstrbase = "iteration-";
+    auto pos = ucode_path.find(iterstrbase);
+    auto iterstr = (pos == std::string::npos) ? std::string() : ucode_path.substr(pos+iterstrbase.size());
+    std::stringstream ss(iterstr);
+    if(!(ss >> iter)) iter = 0; // nothing numeric to read: fall back to iteration 0
+    std::cerr << "iteration: " << iter << std::endl;
+    // The traversal direction is also inferred from substrings of the kernel path.
+    int version = 0; //default to vertex pull
+    if(ucode_path.find("push") != std::string::npos) {
+        version = 1;
+    }
+    int hybrid = 0; //default: direction is not re-chosen at run time
+    if(ucode_path.find("hybrid") != std::string::npos) {
+        hybrid = 1;
+    }
+    std::cerr << "version: " << version << std::endl;
+    std::cerr << "hybrid: " << hybrid << std::endl;
+    std::cerr << "load microcode" << std::endl;
+    hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
+
+    std::cerr << "load graph" << std::endl;
+    std::string graph_f = input.getCmdOption("-g");
+    edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str());
+
+    std::cerr << "size of graph: " << std::endl;
+    std::cerr << edges.num_nodes() << std::endl;
+    std::cerr << edges.num_edges() << std::endl;
+    // Bind the device-side globals declared in the kernel by symbol name and allocate one element per vertex.
+    std::cerr << "init global scalars" << std::endl;
+    p_dev = GlobalScalar<hb_mc_eva_t>("p");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), p_dev);
+    old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
+    new_rank_dev = GlobalScalar<hb_mc_eva_t>("new_rank");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
+    out_degree_dev = GlobalScalar<hb_mc_eva_t>("out_degree");
+    hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
+
+    std::cerr << "init locks" << std::endl;
+    GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
+    hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
+    std::atomic<int> tmp_a[NUM_LOCKS] = {};
+    Device::Ptr device = Device::GetInstance();
+    int start_vertex = ROOT;
+    Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
+    // Host mirrors of the device arrays, one entry per vertex.
+    std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
+    std::vector<float> p(edges.num_nodes(), (float) 0.0);
+    std::vector<float> new_rank(edges.num_nodes(), (float) 0.0);
+    std::vector<float> old_rank(edges.num_nodes(), (float) 0.0);
+    std::vector<int32_t> out_degs = edges.get_out_degrees();
+
+    //compute up to current iter on host
+    hfrontier[start_vertex] = 1;
+    new_rank[start_vertex] = (float) 1.0;
+    old_rank[start_vertex] = (float) 1.0;
+    host_pr_calc(p, old_rank, new_rank, hfrontier, iter);
+
+    //copy all variables at their current state to device
+    frontier.copyToDevice(hfrontier.data(), hfrontier.size());
+    hammerblade::write_global_buffer_dma<float>(p.data(), p_dev, p.size());
+    hammerblade::write_global_buffer_dma<float>(old_rank.data(), old_rank_dev, old_rank.size());
+    hammerblade::write_global_buffer_dma<float>(new_rank.data(), new_rank_dev, new_rank.size());
+    hammerblade::write_global_buffer_dma<int32_t>(out_degs.data(), out_degree_dev, out_degs.size());
+    //initialize locks for atomics on device
+    hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_a, glbl_locks, NUM_LOCKS);
+
+    device->freeze_cores();
+    device->write_dma();
+    device->unfreeze_cores();
+    //determine push or pull traversal for this iteration
+    if(hybrid) {
+        int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1);
+        int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges());
+        if(dir){
+            version = 0; //pull
+        } else {
+            version = 1; //push
+        }
+    }
+    // One device iteration: prefetch, update frontier vertices, update edges, rebuild the frontier.
+    std::cerr << "start of while loop\n";
+    int tag_c = 0;
+    int f_sz = 0;
+    switch(version) {
+        case 0: //vertex pull
+            std::cerr << "pull kernel\n";
+            std::cerr << "preloading the cache\n";
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
+            std::cerr << "run update self vertex kernel\n";
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); // must match the device symbol's exact spelling
+            device->runJobs();
+            tag_c++;
+            std::cerr << "run update edges kernel on iter : " << iter << "\n";
+            device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "create next frontier\n";
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
+            std::cerr << "swap arrays\n";
+            hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+            std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+            break;
+        case 1: //vertex push
+            std::cerr << "push kernel\n";
+            std::cerr << "preloading the cache\n";
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
+            std::cerr << "run update self vertex kernel\n";
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); // must match the device symbol's exact spelling
+            device->runJobs();
+            tag_c++;
+            std::cerr << "run update edges kernel on iter : " << iter << "\n";
+            device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "create next frontier\n";
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
+            std::cerr << "swap arrays\n";
+            hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+            std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+            break;
+    }
+    if(VERIFY) {
+        std::ofstream ver_file;
+        ver_file.open("./rank.txt");
+        std::vector<float> host_rank(edges.num_nodes()); // heap storage: a graph-sized array does not belong on the stack
+        hammerblade::read_global_buffer_dma<float>(host_rank.data(), old_rank_dev, edges.num_nodes());
+        for(int i = 0; i < edges.num_nodes(); i++) {
+            ver_file << host_rank[i] << std::endl;
+        }
+        ver_file.close();
+    }
+    device->finish();
+    return 0;
+}
+
+declare_program_main("test_pr_nibble", test_pr_nibble); 
diff --git a/examples/graphit/pr_nibble/pr.hpp b/examples/graphit/pr_nibble/pr.hpp
new file mode 100644
index 000000000..ae01c8cc2
--- /dev/null
+++ b/examples/graphit/pr_nibble/pr.hpp
@@ -0,0 +1,25 @@
+#pragma once
+#ifndef __PR_PULL_BENCHMARK_HPP
+#define __PR_PULL_BENCHMARK_HPP
+
+#include "hb_intrinsics.h" //graphit host runtime libs
+#include "infra_hb/host/arg_parser.hpp"
+#include <bsg_manycore_regression.h>
+#include <string.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream> 
+#include <atomic>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+
+
+using hammerblade::Device;
+using hammerblade::Vector;
+using hammerblade::GraphHB;
+using hammerblade::GlobalScalar;
+
+#endif
diff --git a/examples/graphit/pr_nibble/pr_host.hpp b/examples/graphit/pr_nibble/pr_host.hpp
new file mode 100644
index 000000000..fcbb811e0
--- /dev/null
+++ b/examples/graphit/pr_nibble/pr_host.hpp
@@ -0,0 +1,45 @@
+//function to compute pr-nibble on host up to current iter
+#pragma once
+#include <iostream>
+#include <fstream>
+// Runs `iter` pr-nibble iterations on the host, updating p, old_rank, new_rank and frontier in place; the graph comes from the file-level `edges`.
+inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank, std::vector<float> & new_rank, std::vector<int> & frontier, int iter) {
+        float alpha = (float) 0.15;
+        float epsilon = (float) 1e-06;
+        auto g = edges.getHostGraph();
+        // Each iteration: update_self on frontier vertices, pull-style edge update, then rebuild the frontier.
+        // NOTE(review): the frontier test below uses >= and skips zero-out-degree vertices, unlike the device filter_frontier (strict >) - confirm intended.
+        for(int i = 0; i < iter; i++) {
+                new_rank.assign(old_rank.begin(), old_rank.end());
+                //print out iteration and size:
+                int num_items = std::count(frontier.begin(), frontier.end(), 1);
+                std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl;
+                //update_self
+                for(int v = 0; v < g.num_nodes(); v++) {
+                        if(frontier[v]) {
+                            p[v] += (2.0 * alpha) / (1.0    + alpha) * old_rank[v];
+                            new_rank[v] = (float) 0.0 ;
+                        }
+                }
+                //update edges
+                for(int d = 0; d < g.num_nodes(); d++) {
+                        for(int s : g.in_neigh(d)) {
+                                if(frontier[s]){
+                                        float update = ((1.0 - alpha) / (1.0    + alpha)) * old_rank[s];
+                                        update = update / ((float) g.out_degree(s));
+                                        new_rank[d] += update;
+                                }
+                        }
+                }
+                old_rank.assign(new_rank.begin(), new_rank.end()); // both rank vectors now hold this iteration's result
+                //update frontier
+                for(int v = 0; v < g.num_nodes(); v++) {
+                        frontier[v] = 0;
+                        if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) {
+                                frontier[v] = 1;
+                        }
+                }
+        }
+        int num_items = std::count(frontier.begin(), frontier.end(), 1);
+        std::cerr << "returning with frontier size: " << num_items << std::endl;
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/.gitignore b/examples/sdh-eval-workloads/ipnsw/.gitignore
new file mode 100644
index 000000000..737e26b00
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/.gitignore
@@ -0,0 +1 @@
+run/
\ No newline at end of file
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp
new file mode 100644
index 000000000..3d14b2c8d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp
@@ -0,0 +1,27 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "BeamSearchKernelRunner.hpp"
+#include "BeamSearchResultReader.hpp"
+namespace ipnsw {
+    /**
+     * IPNSWFactory specialization for the beam-search workload: its two
+     * hooks hand out a BeamSearchKernelRunner and a BeamSearchResultReader.
+     *
+     * Each hook returns a raw pointer obtained from `new`; nothing in this
+     * header releases it.
+     * NOTE(review): presumably IPNSWFactory declares these hooks and takes
+     * ownership of what they return -- confirm in IPNSWFactory.hpp.
+     */
+    class BeamSearchFactory : public IPNSWFactory {
+    private:
+        // Heap-allocates the kernel runner for beam search.
+        IPNSWKernelRunner *_KernelRunner() const {
+            return new BeamSearchKernelRunner;
+        }
+
+        // Heap-allocates the result reader for beam search.
+        IPNSWResultReader *_ResultReader() const {
+            return new BeamSearchResultReader;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
new file mode 100644
index 000000000..6fe724b68
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
@@ -0,0 +1,83 @@
+#pragma once
+#include <string>
+#include <tuple>
+#include <vector>
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+#include "GreedyWalkResults.hpp"
+
+namespace ipnsw {
+    /**
+     * Kernel runner for the "ipnsw_beam_search" kernel.
+     *
+     * The search is seeded from the GREEDY_WALK_RESULTS table rather than
+     * from a walk performed at run time.
+     */
+    class BeamSearchKernelRunner : public IPNSWKernelRunner {
+        // Name of the kernel this runner reports.
+        std::string kernelName(const IPNSWRunner & runner) const {
+            return "ipnsw_beam_search";
+        }
+
+        // Dim built from the configured grp_x() / grp_y().
+        Dim tgd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grp_x(),
+                       runner.cfg().grp_y());
+        }
+
+        // Dim built from the configured grid_x() / grid_y().
+        Dim gd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grid_x(),
+                       runner.cfg().grid_y());
+        }
+
+        /**
+         * Writes the search seed to the device and returns the kernel
+         * argument list.
+         *
+         * The seed is the GREEDY_WALK_RESULTS entry of the first query in
+         * runner._io->do_queries(), or of IPNSWRunner::QUERY when that list
+         * is empty. Despite being const, this method writes the seed's
+         * vertex and distance to v_curr_dev(0) / d_curr_dev(0).
+         *
+         * @param runner source of the query list and of every argument value
+         * @return the nine hb_mc_eva_t values passed to the kernel, in order
+         * @throws std::out_of_range if the selected query index has no
+         *         entry in GREEDY_WALK_RESULTS
+         */
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+            const std::vector<int> do_queries = runner._io->do_queries();
+
+            // Default query unless the runner was given an explicit list.
+            int query = IPNSWRunner::QUERY;
+            if (!do_queries.empty()) {
+                query = do_queries[0];
+            }
+
+            // at() rather than operator[]: a query index outside the table
+            // (including a negative one) throws instead of reading out of
+            // bounds.
+            const GreedyWalkResult & seed = GREEDY_WALK_RESULTS.at(query);
+            int v_curr = std::get<GWR_VERT>(seed);
+            float d_curr = std::get<GWR_DIST>(seed);
+
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            hb->write(runner.v_curr_dev(0), &v_curr, sizeof(v_curr));
+            hb->write(runner.d_curr_dev(0), &d_curr, sizeof(d_curr));
+
+            std::vector<hb_mc_eva_t> argv = {
+                runner.graph_metadata_dev(),
+                runner.db_dev(),
+                runner.query_dev(0),
+                runner.seen_dev(0),
+                runner.v_curr_dev(0),
+                runner.d_curr_dev(0),
+                runner.candidates_dev(0),
+                runner.results_dev(0),
+                runner.n_results_dev(0),
+            };
+            return argv;
+        }
+
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
new file mode 100644
index 000000000..3d4cc7493
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
@@ -0,0 +1,59 @@
+#pragma once
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+#include "GreedyWalkResults.hpp"
+
+namespace ipnsw {
+    /**
+     * Result reader for the beam-search kernel: fetches the result list of
+     * group 0 from the device and prints it to stdout.
+     */
+    class BeamSearchResultReader : public IPNSWResultReader {
+    public:
+        /**
+         * Reads the result count, then that many GreedyWalkResult entries,
+         * and prints each as "{<0>,<1>}" under a "Beam search:" heading.
+         *
+         * NOTE(review): entries are copied bytewise from the device into
+         * GreedyWalkResult objects; this assumes the kernel writes the same
+         * object layout -- confirm against the kernel source.
+         *
+         * @param runner supplies the device addresses of the count and list
+         * @throws std::runtime_error if the device reports a negative count
+         */
+        void readResults(const IPNSWRunner & runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+
+            hb_mc_eva_t grp = 0;
+            int n_results = 0;
+            hb->read(runner.n_results_dev(grp), &n_results, sizeof(n_results));
+
+            // A negative count would otherwise wrap to an enormous vector
+            // size; fail with a message that names the real problem.
+            if (n_results < 0) {
+                throw std::runtime_error(
+                    "beam search returned a negative result count: "
+                    + std::to_string(n_results));
+            }
+
+            std::vector<GreedyWalkResult> results(n_results);
+            // Nothing to fetch when the list is empty; this also avoids
+            // taking the address of element 0 of an empty vector.
+            if (!results.empty()) {
+                hb->push_read(runner.results_dev(grp), results.data(),
+                              results.size() * sizeof(GreedyWalkResult));
+                hb->sync_read();
+            }
+
+            std::cout << "Beam search:" << std::endl;
+            for (const auto & r : results) {
+                std::cout << "{" << std::get<0>(r) << "," << std::get<1>(r) << "}" << std::endl;
+            }
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp
new file mode 100644
index 000000000..e98f11ad2
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "GreedyWalkKernelRunner.hpp"
+#include "GreedyWalkResultReader.hpp"
+
+namespace ipnsw {
+    /**
+     * IPNSWFactory specialization for the greedy-walk workload: its two
+     * hooks hand out a GreedyWalkKernelRunner and a GreedyWalkResultReader.
+     *
+     * Each hook returns a raw pointer obtained from `new`; nothing in this
+     * header releases it.
+     * NOTE(review): presumably IPNSWFactory declares these hooks and takes
+     * ownership of what they return -- confirm in IPNSWFactory.hpp.
+     */
+    class GreedyWalkFactory : public IPNSWFactory {
+    private:
+        // Heap-allocates the kernel runner for the greedy walk.
+        IPNSWKernelRunner *_KernelRunner() const {
+            return new GreedyWalkKernelRunner;
+        }
+
+        // Heap-allocates the result reader for the greedy walk.
+        IPNSWResultReader *_ResultReader() const {
+            return new GreedyWalkResultReader;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
new file mode 100644
index 000000000..ac51739b4
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
@@ -0,0 +1,46 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+
+namespace ipnsw {
+    /**
+     * Kernel runner for the "ipnsw_greedy_search" kernel.
+     */
+    class GreedyWalkKernelRunner : public IPNSWKernelRunner {
+
+        // Name of the kernel this runner reports.
+        std::string kernelName(const IPNSWRunner & runner) const {
+            return std::string("ipnsw_greedy_search");
+        }
+
+        // Dim built from the configured grp_x() / grp_y().
+        Dim tgd(const IPNSWRunner & runner) const {
+            const auto x = runner.cfg().grp_x();
+            const auto y = runner.cfg().grp_y();
+            return Dim(x, y);
+        }
+
+        // Dim built from the configured grid_x() / grid_y().
+        Dim gd(const IPNSWRunner & runner) const {
+            const auto x = runner.cfg().grid_x();
+            const auto y = runner.cfg().grid_y();
+            return Dim(x, y);
+        }
+
+        /**
+         * Builds the kernel argument list: six values taken from the
+         * runner, using index 0 for every indexed accessor.
+         */
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+            std::vector<hb_mc_eva_t> args;
+            args.push_back(runner.graph_metadata_dev());
+            args.push_back(runner.db_dev());
+            args.push_back(runner.query_dev(0));
+            args.push_back(runner.seen_dev(0));
+            args.push_back(runner.v_curr_dev(0));
+            args.push_back(runner.d_curr_dev(0));
+            return args;
+        }
+
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
new file mode 100644
index 000000000..6ca7851ff
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+
+namespace ipnsw {
+    /**
+     * Result reader for the greedy walk: fetches v_curr and d_curr from the
+     * device and prints them to stdout.
+     */
+    class GreedyWalkResultReader : public IPNSWResultReader {
+    public:
+        /**
+         * Reads one int from v_curr_dev(0) and one float from d_curr_dev(0)
+         * and prints them as "Greedy walk (v_curr,d_curr) = (<v>,<d>)".
+         */
+        void readResults(const IPNSWRunner & runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+
+            int vertex;
+            hb->read(runner.v_curr_dev(0), &vertex, sizeof(vertex));
+
+            float distance;
+            hb->read(runner.d_curr_dev(0), &distance, sizeof(distance));
+
+            std::cout << "Greedy walk (v_curr,d_curr) = ";
+            std::cout << "(" << vertex << "," << distance << ")";
+            std::cout << std::endl;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp
new file mode 100644
index 000000000..7d37104df
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp
@@ -0,0 +1,517 @@
+#include "GreedyWalkResults.hpp"
+namespace ipnsw {
+    std::vector<GreedyWalkResult> GREEDY_WALK_RESULTS = {
+	GreedyWalkResult(static_cast<float>(-0x1.94442e0000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.e72901fffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.cb85360000001p-4),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.e56d7ffffffffp-8), 78517),
+	GreedyWalkResult(static_cast<float>(-0x1.655f860000000p-4),732469),
+	GreedyWalkResult(static_cast<float>(-0x1.04cbcc0000000p-4),380912),
+	GreedyWalkResult(static_cast<float>(-0x1.3243d20000000p-5),606365),
+	GreedyWalkResult(static_cast<float>(-0x1.2dbf640000000p-4),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.fa90ea0000001p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.2922f80000000p-3),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.5974060000000p-1),725033),
+	GreedyWalkResult(static_cast<float>(-0x1.abcf2c0000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.b262380000000p-1),272753),
+	GreedyWalkResult(static_cast<float>(-0x1.c0e98a0000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.01b4680000000p-2),184077),
+	GreedyWalkResult(static_cast<float>(-0x1.96e3280000000p-2),208965),
+	GreedyWalkResult(static_cast<float>(-0x1.58dd120000000p-3),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.1f333a0000000p-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.8db7de0000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.4e43500000000p-2),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.a5ae760000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.7fcff00000000p-5),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.5630f40000000p-1),960530),
+	GreedyWalkResult(static_cast<float>(-0x1.48d8c20000000p-1),853984),
+	GreedyWalkResult(static_cast<float>(-0x1.14556ffffffffp+0),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.a746760000000p-2),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.ddcb81fffffffp-3),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.94a92ffffffffp-2),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.45b69c0000000p-1),432335),
+	GreedyWalkResult(static_cast<float>(-0x1.2ef8fa0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.9909440000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1ce937fffffffp-5),321516),
+	GreedyWalkResult(static_cast<float>(-0x1.c0de380000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.0de8e60000000p-8),897966),
+	GreedyWalkResult(static_cast<float>(-0x1.99783c0000000p-1),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.01316e0000000p+0),886263),
+	GreedyWalkResult(static_cast<float>(-0x1.a172140000000p-6),177485),
+	GreedyWalkResult(static_cast<float>(-0x1.2b8f9a0000000p-7),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.924b440000000p-5),290055),
+	GreedyWalkResult(static_cast<float>(-0x1.8515aa0000000p-2),905210),
+	GreedyWalkResult(static_cast<float>(-0x1.f68975ffffffep-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.dd5ed00000001p-6),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.be40740000000p-1),870888),
+	GreedyWalkResult(static_cast<float>(-0x1.08f4460000001p-2),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.2589100000000p-2),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.e43ad00000001p-3),230001),
+	GreedyWalkResult(static_cast<float>(-0x1.161b360000000p+0),646867),
+	GreedyWalkResult(static_cast<float>(-0x1.475e87fffffffp-6),179303),
+	GreedyWalkResult(static_cast<float>(-0x1.425b1c0000000p-1),463324),
+	GreedyWalkResult(static_cast<float>(-0x1.f4b68c0000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.1333440000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.0e35aa0000000p-1),312088),
+	GreedyWalkResult(static_cast<float>(-0x1.1b7653fffffffp+0),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.cb8adc0000000p-3),491377),
+	GreedyWalkResult(static_cast<float>(-0x1.51a0380000000p-1),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.e4b9940000000p-2),603696),
+	GreedyWalkResult(static_cast<float>(-0x1.623f9a0000000p-2),991097),
+	GreedyWalkResult(static_cast<float>(-0x1.1660b20000000p-1), 18868),
+	GreedyWalkResult(static_cast<float>(-0x1.bd75200000000p-7), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.4dbbe00000000p+0), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.1b55860000000p-5),310512),
+	GreedyWalkResult(static_cast<float>(-0x1.1f40e00000000p+0),115894),
+	GreedyWalkResult(static_cast<float>(-0x1.d403c60000001p-2),718485),
+	GreedyWalkResult(static_cast<float>(-0x1.a7b7bdfffffffp-7),601673),
+	GreedyWalkResult(static_cast<float>(-0x1.7f5c8c0000000p-2),552153),
+	GreedyWalkResult(static_cast<float>(-0x1.6834060000001p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.8ccf620000000p-2),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.1508660000000p+0),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.6362300000000p-1),982683),
+	GreedyWalkResult(static_cast<float>(-0x1.175fbc0000000p-4),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.10e30a0000000p-5),703851),
+	GreedyWalkResult(static_cast<float>(-0x1.0343340000000p+0),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.9337a20000000p-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.986e8a0000000p-7),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.1f400a0000000p+0),336830),
+	GreedyWalkResult(static_cast<float>(-0x1.3c0e060000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.8589cc0000000p-1),118607),
+	GreedyWalkResult(static_cast<float>(-0x1.745f000000000p-3),272753),
+	GreedyWalkResult(static_cast<float>(-0x1.317ca40000000p-4),494402),
+	GreedyWalkResult(static_cast<float>(-0x1.ebd52a0000001p-7),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.7ad9100000001p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.6ed8a00000000p-2),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.273edc0000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.93db8c0000000p-1),620143),
+	GreedyWalkResult(static_cast<float>(-0x1.324dd60000000p-4),778172),
+	GreedyWalkResult(static_cast<float>(-0x1.3c59a80000000p-1),270175),
+	GreedyWalkResult(static_cast<float>(-0x1.fc51e80000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.a7fbc60000000p-2),603696),
+	GreedyWalkResult(static_cast<float>(-0x1.ab76780000000p-1),406402),
+	GreedyWalkResult(static_cast<float>(-0x1.8733320000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.447bb00000000p-1),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.b5c3140000000p-5),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.ca9b880000000p-1),785859),
+	GreedyWalkResult(static_cast<float>(-0x1.beee640000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.47b4e80000000p-1),738101),
+	GreedyWalkResult(static_cast<float>(-0x1.069a7c0000000p-1),193430),
+	GreedyWalkResult(static_cast<float>(-0x1.20f53c0000000p-1),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.1612f80000000p-2),711979),
+	GreedyWalkResult(static_cast<float>(-0x1.25c6c80000000p-1),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.2507300000000p-2), 36731),
+	GreedyWalkResult(static_cast<float>(-0x1.14ef720000001p+0),268974),
+	GreedyWalkResult(static_cast<float>(-0x1.2b54f80000000p-4), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.e07ccbfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.070d960000000p-4),785239),
+	GreedyWalkResult(static_cast<float>(-0x1.49e6200000000p-1),496330),
+	GreedyWalkResult(static_cast<float>(-0x1.86c9080000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.0b584c0000000p-1),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.bb1ee00000000p-7),439426),
+	GreedyWalkResult(static_cast<float>(-0x1.ff17c9fffffffp-11),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.0da6980000000p+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1d0ba40000001p-3),288912),
+	GreedyWalkResult(static_cast<float>(-0x1.301dec0000000p-1),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.2f9b800000000p-4),261103),
+	GreedyWalkResult(static_cast<float>(-0x1.8d769e0000000p-4),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.6ea4f80000000p-3),223977),
+	GreedyWalkResult(static_cast<float>(-0x1.fcc7dc0000000p-2),662137),
+	GreedyWalkResult(static_cast<float>(-0x1.5949fe0000000p-3),565830),
+	GreedyWalkResult(static_cast<float>(-0x1.1a11aa0000000p-1),908217),
+	GreedyWalkResult(static_cast<float>(-0x1.8bff140000000p-1),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.7ccda1fffffffp-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.80bf6e0000000p-2), 50016),
+	GreedyWalkResult(static_cast<float>(-0x1.3444300000000p-2),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.c8e8bc0000000p-1),223249),
+	GreedyWalkResult(static_cast<float>(-0x1.679767fffffffp-3),494887),
+	GreedyWalkResult(static_cast<float>(-0x1.6c896c0000000p-3),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.413b740000000p-4),772422),
+	GreedyWalkResult(static_cast<float>(-0x1.4e1d760000000p-3),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.7202fe0000001p-1),131611),
+	GreedyWalkResult(static_cast<float>(-0x1.2589840000000p+0),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.5820da0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.96ceb00000001p-3),177485),
+	GreedyWalkResult(static_cast<float>(-0x1.d6ac77fffffffp-4),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.bfefa00000000p-7),149329),
+	GreedyWalkResult(static_cast<float>(-0x1.69ac280000000p-1), 73867),
+	GreedyWalkResult(static_cast<float>(-0x1.04bb900000000p+0),567514),
+	GreedyWalkResult(static_cast<float>(-0x1.142a3dfffffffp+0),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.2f1ca40000000p-5),552153),
+	GreedyWalkResult(static_cast<float>(-0x1.1def580000000p-1),679881),
+	GreedyWalkResult(static_cast<float>(-0x1.072ac60000000p-4), 29163),
+	GreedyWalkResult(static_cast<float>(-0x1.2821940000000p-4),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.72a68e0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.cafce80000000p-3),729852),
+	GreedyWalkResult(static_cast<float>(-0x1.3ba2d80000000p-2),729021),
+	GreedyWalkResult(static_cast<float>(-0x1.68739e0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.aeb25c0000000p-1),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.18c0840000000p-5),693842),
+	GreedyWalkResult(static_cast<float>(-0x1.fe21ce0000001p-1), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.b41fb00000001p-1),735181),
+	GreedyWalkResult(static_cast<float>(-0x1.2826320000000p-8),379502),
+	GreedyWalkResult(static_cast<float>(-0x1.5eecda0000000p-1),925333),
+	GreedyWalkResult(static_cast<float>(-0x1.b002d40000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.4e53aa0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.a1b49bfffffffp-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.f1c7ac0000000p-1),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.67f6720000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.31a6600000001p-6),341861),
+	GreedyWalkResult(static_cast<float>(-0x1.61c1080000000p-3),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.aaa3780000000p-2),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.3fa68a0000001p-6),160291),
+	GreedyWalkResult(static_cast<float>(-0x1.38c0b20000000p-1),379199),
+	GreedyWalkResult(static_cast<float>(-0x1.ee68980000001p-2),318485),
+	GreedyWalkResult(static_cast<float>(-0x1.dd852c0000001p-2),655315),
+	GreedyWalkResult(static_cast<float>(-0x1.06fa43fffffffp+0),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.07007e0000000p+0),926790),
+	GreedyWalkResult(static_cast<float>(-0x1.f352a1fffffffp-1),523435),
+	GreedyWalkResult(static_cast<float>(-0x1.c6d6160000000p-1),169991),
+	GreedyWalkResult(static_cast<float>(-0x1.090c620000000p-5),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.19f6860000000p+0),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.e3f8580000001p-2),255916),
+	GreedyWalkResult(static_cast<float>(-0x1.2148180000000p-1),206826),
+	GreedyWalkResult(static_cast<float>(-0x1.0487660000000p-2),494402),
+	GreedyWalkResult(static_cast<float>(-0x1.be5ea00000000p-3),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.114b0a0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1e0a2a0000000p-7),379350),
+	GreedyWalkResult(static_cast<float>(-0x1.22f06bfffffffp+0),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.bc42c20000000p-1),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.9ec387fffffffp-2),495101),
+	GreedyWalkResult(static_cast<float>(-0x1.ab66b80000000p-3),115894),
+	GreedyWalkResult(static_cast<float>(-0x1.9be6e80000000p-4),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.4cdc7ffffffffp-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.c7a31c0000000p-7),764589),
+	GreedyWalkResult(static_cast<float>(-0x1.a35f1c0000000p-8),115043),
+	GreedyWalkResult(static_cast<float>(-0x1.3422a00000000p-1),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.5a4aa60000000p-4), 49557),
+	GreedyWalkResult(static_cast<float>(-0x1.06eddc0000000p-2),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.d46bde0000000p-1),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.02e72c0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.e33abffffffffp-2),112248),
+	GreedyWalkResult(static_cast<float>(-0x1.ae74060000001p-4),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.272a2bfffffffp-7),850826),
+	GreedyWalkResult(static_cast<float>(-0x1.357f25fffffffp-2),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.33c9f1fffffffp-3), 25893),
+	GreedyWalkResult(static_cast<float>(-0x1.771fdc0000001p-5),305162),
+	GreedyWalkResult(static_cast<float>(-0x1.18a1080000000p-4),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.46ad1e0000000p-4),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.0a53300000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.783f4e0000000p-6),546811),
+	GreedyWalkResult(static_cast<float>(-0x1.3f05b60000000p-3),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.602d5c0000000p-3),463324),
+	GreedyWalkResult(static_cast<float>(-0x1.c8f2b20000000p-5),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.0bde920000000p+0),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.8eb3fe0000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.d981120000002p-3),849285),
+	GreedyWalkResult(static_cast<float>(-0x1.d8151a0000001p-1),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.c231ec0000000p-1),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.c742700000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.2a6c6a0000000p+0),945767),
+	GreedyWalkResult(static_cast<float>(-0x1.5b8c5bfffffffp-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.391a700000000p-12),562015),
+	GreedyWalkResult(static_cast<float>(-0x1.896b960000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.28e7fe0000000p-3),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.577a11fffffffp-4),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.43b7f80000000p-4),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.7e64600000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.97ebe20000000p-5),392823),
+	GreedyWalkResult(static_cast<float>(-0x1.a856440000000p-3),793084),
+	GreedyWalkResult(static_cast<float>(-0x1.84531a0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.7c80d40000000p-4),186838),
+	GreedyWalkResult(static_cast<float>(-0x1.0c56e5fffffffp+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.72c0da0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1844d00000000p-5),606365),
+	GreedyWalkResult(static_cast<float>(-0x1.52a5d40000000p-10),470059),
+	GreedyWalkResult(static_cast<float>(-0x1.7d31400000000p-1),738101),
+	GreedyWalkResult(static_cast<float>(-0x1.c47df00000000p-7),710471),
+	GreedyWalkResult(static_cast<float>(-0x1.dc3ccbfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.5e773c0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.7ffd660000000p-2),920345),
+	GreedyWalkResult(static_cast<float>(-0x1.ab0dc00000001p-2),677155),
+	GreedyWalkResult(static_cast<float>(-0x1.7f8db00000000p-5),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.add3b60000000p-1),293302),
+	GreedyWalkResult(static_cast<float>(-0x1.e0328c0000000p-4),758625),
+	GreedyWalkResult(static_cast<float>(-0x1.6022ce0000000p-5),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.a1d241fffffffp-4),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.cec5e60000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.893f260000000p-3),855760),
+	GreedyWalkResult(static_cast<float>(-0x1.0790c00000000p-2),145893),
+	GreedyWalkResult(static_cast<float>(-0x1.49456ffffffffp-7),215955),
+	GreedyWalkResult(static_cast<float>(-0x1.71b1bc0000001p-5),312088),
+	GreedyWalkResult(static_cast<float>(-0x1.8b1c580000000p-1),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.2010d20000000p-4),142436),
+	GreedyWalkResult(static_cast<float>(-0x1.c33ecc0000000p-4),280878),
+	GreedyWalkResult(static_cast<float>(-0x1.6b1dce0000000p-2),444780),
+	GreedyWalkResult(static_cast<float>(-0x1.f76bb60000001p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.87151ffffffffp-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.f522ae0000000p-2),  9333),
+	GreedyWalkResult(static_cast<float>(-0x1.77d5c40000001p-4),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.f7f4edfffffffp-5),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.1c46b00000000p-1),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.a4f43bfffffffp-6),140906),
+	GreedyWalkResult(static_cast<float>(-0x1.8952480000000p-1),670146),
+	GreedyWalkResult(static_cast<float>(-0x1.ca891c0000000p-7),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.e36b85fffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1aaf580000000p-3),909372),
+	GreedyWalkResult(static_cast<float>(-0x1.8116920000000p-8), 51434),
+	GreedyWalkResult(static_cast<float>(-0x1.acc07e0000000p-1), 26012),
+	GreedyWalkResult(static_cast<float>(-0x1.a2316c0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.3a68660000000p-3),628152),
+	GreedyWalkResult(static_cast<float>(-0x1.c199e80000000p-2),907223),
+	GreedyWalkResult(static_cast<float>(-0x1.8bfc920000000p-3), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.c9b8520000000p-5),568921),
+	GreedyWalkResult(static_cast<float>(-0x1.be82e20000000p-2),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.8cabe60000001p-2),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.7222980000000p-1),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.b313ea0000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.8b56380000000p-7), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.3e74440000000p-3),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.6349900000000p-9),136557),
+	GreedyWalkResult(static_cast<float>(-0x1.2128060000001p+0),672634),
+	GreedyWalkResult(static_cast<float>(-0x1.25d0560000001p-8),314066),
+	GreedyWalkResult(static_cast<float>(-0x1.206c1a0000000p+0),288181),
+	GreedyWalkResult(static_cast<float>(-0x1.696a200000001p-3),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.1a74180000000p-1),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.608e8a0000000p-2),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.e583780000001p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.cdfae5fffffffp-1),288181),
+	GreedyWalkResult(static_cast<float>(-0x1.53c3200000001p-5),926790),
+	GreedyWalkResult(static_cast<float>(-0x1.a8f37bfffffffp-5),164698),
+	GreedyWalkResult(static_cast<float>(-0x1.e1399ffffffffp-7),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.adf8240000000p-3),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.f91ca60000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.b717880000000p-3), 70417),
+	GreedyWalkResult(static_cast<float>(-0x1.57b0760000000p-4),939764),
+	GreedyWalkResult(static_cast<float>(-0x1.1de1ca0000000p+0), 74899),
+	GreedyWalkResult(static_cast<float>(-0x1.c67da40000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.64c96c0000001p-2),261103),
+	GreedyWalkResult(static_cast<float>(-0x1.54c6240000000p-1),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.0274f60000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.5b05140000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.1a4ca80000000p-6),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.de24900000000p-1),836318),
+	GreedyWalkResult(static_cast<float>(-0x1.5c834e0000000p-3),228059),
+	GreedyWalkResult(static_cast<float>(-0x1.682d5c0000000p-3),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.b96de80000000p-1),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.f1c5680000000p-1),186838),
+	GreedyWalkResult(static_cast<float>(-0x1.d87015fffffffp-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.992d1ffffffffp-2),884850),
+	GreedyWalkResult(static_cast<float>(-0x1.38d1580000001p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.a59a700000001p-3),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.bb07fe0000001p-5),531816),
+	GreedyWalkResult(static_cast<float>(-0x1.48fa060000000p-1),128603),
+	GreedyWalkResult(static_cast<float>(-0x1.81b2000000001p-7),129055),
+	GreedyWalkResult(static_cast<float>(-0x1.4bfc5bfffffffp-2),576030),
+	GreedyWalkResult(static_cast<float>(-0x1.4683200000000p-1),727476),
+	GreedyWalkResult(static_cast<float>(-0x1.9165800000000p-5), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.2b59be0000000p-3),941181),
+	GreedyWalkResult(static_cast<float>(-0x1.21086e0000000p-5),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.1fb5700000000p-7),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.3fa0620000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.d2b8bdfffffffp-2),355312),
+	GreedyWalkResult(static_cast<float>(-0x1.ec8a43fffffffp-2),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.eeaace0000000p-9),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.5649140000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.49e3ae0000001p-6), 29163),
+	GreedyWalkResult(static_cast<float>(-0x1.b53db20000001p-5),442413),
+	GreedyWalkResult(static_cast<float>(-0x1.5aa6380000000p-3),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.cdc0f80000000p-3),450479),
+	GreedyWalkResult(static_cast<float>(-0x1.c9aab80000000p-2),541408),
+	GreedyWalkResult(static_cast<float>(-0x1.0d78740000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1a48820000000p-6),810043),
+	GreedyWalkResult(static_cast<float>(-0x1.3a76fc0000000p-1),804725),
+	GreedyWalkResult(static_cast<float>(-0x1.2f318a0000000p-7),562579),
+	GreedyWalkResult(static_cast<float>(-0x1.6c91920000000p-2),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.9ac5940000000p-4),263560),
+	GreedyWalkResult(static_cast<float>(-0x1.42bc8c0000000p-1),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.906b7c0000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.3586ac0000000p-7), 53791),
+	GreedyWalkResult(static_cast<float>(-0x1.69ef5a0000000p-3),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.4e4f3e0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.b379440000000p-1),980037),
+	GreedyWalkResult(static_cast<float>(-0x1.1a94380000000p+0),624004),
+	GreedyWalkResult(static_cast<float>(-0x1.5e22e00000001p-8), 36331),
+	GreedyWalkResult(static_cast<float>(-0x1.919a7c0000000p-1),883883),
+	GreedyWalkResult(static_cast<float>(-0x1.0313ea0000000p+0),117555),
+	GreedyWalkResult(static_cast<float>(-0x1.8781320000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.8504900000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.2e79740000000p-2),827608),
+	GreedyWalkResult(static_cast<float>(-0x1.91ac000000000p-5),355549),
+	GreedyWalkResult(static_cast<float>(-0x1.e0b6b80000000p-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.ae8bd00000000p-1), 26012),
+	GreedyWalkResult(static_cast<float>(-0x1.edd4cc0000001p-5),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.1191160000000p-6),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.3c69140000000p-2),192244),
+	GreedyWalkResult(static_cast<float>(-0x1.30a7540000000p+0),804725),
+	GreedyWalkResult(static_cast<float>(-0x1.77bda40000002p-5),654035),
+	GreedyWalkResult(static_cast<float>(-0x1.f0496e0000001p-1),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.788009fffffffp-4),439426),
+	GreedyWalkResult(static_cast<float>(-0x1.3527f9fffffffp+0),354262),
+	GreedyWalkResult(static_cast<float>(-0x1.1914b20000000p+0), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.4b03460000000p-4),648421),
+	GreedyWalkResult(static_cast<float>(-0x1.25ae300000000p-1),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.cd467c0000000p-6), 47898),
+	GreedyWalkResult(static_cast<float>(-0x1.e082960000001p-3),169790),
+	GreedyWalkResult(static_cast<float>(-0x1.38970e0000000p-5),495101),
+	GreedyWalkResult(static_cast<float>(-0x1.d88693fffffffp-2),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.13046c0000000p-1),439129),
+	GreedyWalkResult(static_cast<float>(-0x1.ed2e720000001p-4),749981),
+	GreedyWalkResult(static_cast<float>(-0x1.b162180000000p-5),864388),
+	GreedyWalkResult(static_cast<float>(-0x1.458a1a0000000p-2),121683),
+	GreedyWalkResult(static_cast<float>(-0x1.ffddf40000000p-6), 82234),
+	GreedyWalkResult(static_cast<float>(-0x1.c99b320000001p-6),495323),
+	GreedyWalkResult(static_cast<float>(-0x1.aa13de0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.36671e0000000p-4),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.276aaa0000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.41718e0000000p-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.39280c0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.8156020000000p-6),854497),
+	GreedyWalkResult(static_cast<float>(-0x1.075a840000000p+0),930775),
+	GreedyWalkResult(static_cast<float>(-0x1.0b01560000000p-1), 52041),
+	GreedyWalkResult(static_cast<float>(-0x1.fabeec0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.794f3a0000000p-1),384841),
+	GreedyWalkResult(static_cast<float>(-0x1.d9d54dfffffffp-1),419057),
+	GreedyWalkResult(static_cast<float>(-0x1.c27da80000000p-2),219992),
+	GreedyWalkResult(static_cast<float>(-0x1.0d06660000000p-5),563395),
+	GreedyWalkResult(static_cast<float>(-0x1.7ee86e0000000p-1),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.a219b9fffffffp-3),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.434a760000000p-4), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.6cf4380000000p-1),677921),
+	GreedyWalkResult(static_cast<float>(-0x1.94c9c00000000p-6),901398),
+	GreedyWalkResult(static_cast<float>(-0x1.c625540000000p-5),932100),
+	GreedyWalkResult(static_cast<float>(-0x1.2309d40000000p-1),677155),
+	GreedyWalkResult(static_cast<float>(-0x1.3719a60000000p-4),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.2c1eba0000000p-6),527498),
+	GreedyWalkResult(static_cast<float>(-0x1.affd100000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.09db9c0000000p-2),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.b991e00000000p-4),535044),
+	GreedyWalkResult(static_cast<float>(-0x1.2c3aec0000000p-8),938124),
+	GreedyWalkResult(static_cast<float>(-0x1.cce0d20000000p-1),496356),
+	GreedyWalkResult(static_cast<float>(-0x1.d80a4a0000000p-8),776790),
+	GreedyWalkResult(static_cast<float>(-0x1.b3f6ec0000000p-1),749772),
+	GreedyWalkResult(static_cast<float>(-0x1.d370f60000000p-1),441230),
+	GreedyWalkResult(static_cast<float>(-0x1.17859e0000000p+0), 12009),
+	GreedyWalkResult(static_cast<float>(-0x1.552dde0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.1e56f40000000p+0), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.5b74140000000p-4),186084),
+	GreedyWalkResult(static_cast<float>(-0x1.2bc8580000000p+0),870888),
+	GreedyWalkResult(static_cast<float>(-0x1.03ba840000000p+0),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.9e8ea80000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.9181880000000p-6),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.fd3e6a0000000p-3),255668),
+	GreedyWalkResult(static_cast<float>(-0x1.d793e5fffffffp-6),511753),
+	GreedyWalkResult(static_cast<float>(-0x1.335bf00000000p-6),679881),
+	GreedyWalkResult(static_cast<float>(-0x1.98bd340000000p-1), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.37253c0000000p-3),337863),
+	GreedyWalkResult(static_cast<float>(-0x1.55a79e0000000p-2),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.f2ead00000001p-3),430269),
+	GreedyWalkResult(static_cast<float>(-0x1.f45e060000002p-3),226356),
+	GreedyWalkResult(static_cast<float>(-0x1.c435d60000001p-9), 81654),
+	GreedyWalkResult(static_cast<float>(-0x1.1ea9580000000p+0),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.cc1a520000000p-2),444956),
+	GreedyWalkResult(static_cast<float>(-0x1.9428000000000p-2),914163),
+	GreedyWalkResult(static_cast<float>(-0x1.8f2a440000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.077cdc0000000p+0),582680),
+	GreedyWalkResult(static_cast<float>(-0x1.31819c0000000p-3),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.5ae2840000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.0f86240000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.e4b8040000000p-2),  5217),
+	GreedyWalkResult(static_cast<float>(-0x1.92a3020000000p-6),866106),
+	GreedyWalkResult(static_cast<float>(-0x1.4c2bd40000000p-3),560074),
+	GreedyWalkResult(static_cast<float>(-0x1.96bfae0000000p-2),225945),
+	GreedyWalkResult(static_cast<float>(-0x1.7cfb9a0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.809e320000001p-5),890893),
+	GreedyWalkResult(static_cast<float>(-0x1.1156de0000000p-1),313671),
+	GreedyWalkResult(static_cast<float>(-0x1.eb64960000000p-1), 23136),
+	GreedyWalkResult(static_cast<float>(-0x1.5a97fa0000000p-2),228059),
+	GreedyWalkResult(static_cast<float>(-0x1.2f87c20000001p-1),945767),
+	GreedyWalkResult(static_cast<float>(-0x1.45a1460000000p-4), 29348),
+	GreedyWalkResult(static_cast<float>(-0x1.ddef220000001p-3),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.0b9e120000000p-5),179074),
+	GreedyWalkResult(static_cast<float>(-0x1.f977160000001p-4),141149),
+	GreedyWalkResult(static_cast<float>(-0x1.b366bc0000000p-1),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.7009520000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.08adbe0000000p-3),550091),
+	GreedyWalkResult(static_cast<float>(-0x1.c989580000000p-4),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.56433a0000000p-1),672634),
+	GreedyWalkResult(static_cast<float>(-0x1.dbe0b00000000p-5),667763),
+	GreedyWalkResult(static_cast<float>(-0x1.11c0620000000p+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.d6d5560000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.899de00000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.fc2835fffffffp-4),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.28141e0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.abb32c0000000p-1),134340),
+	GreedyWalkResult(static_cast<float>(-0x1.2c2b640000001p-3),926855),
+	GreedyWalkResult(static_cast<float>(-0x1.3447780000000p-3), 47688),
+	GreedyWalkResult(static_cast<float>(-0x1.5fb8300000000p-6),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.73cba7fffffffp-4), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.b99f040000000p-1), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.6b9ba60000000p-1),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.d3aa360000000p-1),192244),
+	GreedyWalkResult(static_cast<float>(-0x1.25282a0000000p+0),275023),
+	GreedyWalkResult(static_cast<float>(-0x1.16c09a0000000p-5), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.bdd6720000000p-3),667763),
+	GreedyWalkResult(static_cast<float>(-0x1.7421400000000p-1),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.dfa079fffffffp-9),630231),
+	GreedyWalkResult(static_cast<float>(-0x1.debb760000001p-2),778627),
+	GreedyWalkResult(static_cast<float>(-0x1.3589be0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a659d00000000p-3),353498),
+	GreedyWalkResult(static_cast<float>(-0x1.9f913bfffffffp-4),936836),
+	GreedyWalkResult(static_cast<float>(-0x1.3b78740000000p-3),504419),
+	GreedyWalkResult(static_cast<float>(-0x1.42611c0000000p-3),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.4e66860000000p-6),439809),
+	GreedyWalkResult(static_cast<float>(-0x1.4a79000000000p-1),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.41902a0000000p+0),774981),
+	GreedyWalkResult(static_cast<float>(-0x1.4850a60000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a7bf000000000p-1),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.9d67d60000001p-5),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.c908860000000p-2),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.63e9520000000p-2),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.e423200000000p-5),295526),
+	GreedyWalkResult(static_cast<float>(-0x1.91894ffffffffp-2),476414),
+	GreedyWalkResult(static_cast<float>(-0x1.29ba4a0000000p-4),774219),
+	GreedyWalkResult(static_cast<float>(-0x1.a577500000000p-1),582680),
+	GreedyWalkResult(static_cast<float>(-0x1.de39c80000000p-2),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.f75ad40000001p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.93794a0000000p-1),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.5f65ec0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.23f7820000000p-6),786537),
+	GreedyWalkResult(static_cast<float>(-0x1.a4f01e0000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.218c620000000p-12),134340),
+	GreedyWalkResult(static_cast<float>(-0x1.33a59e0000000p-1), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.c9920c0000000p-2),523435),
+	GreedyWalkResult(static_cast<float>(-0x1.18be840000000p-2),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.0442d60000000p-1),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.047e940000000p+0),255668),
+	GreedyWalkResult(static_cast<float>(-0x1.0d97ac0000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.2a5e4e0000001p-4),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.f4887bfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a8d50c0000001p-2),531816),
+	GreedyWalkResult(static_cast<float>(-0x1.8e5e300000001p-4),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.06e1a40000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.2e98940000000p-5),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.1d7bb60000000p-4),320041),
+	GreedyWalkResult(static_cast<float>(-0x1.93514a0000000p-6), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.fe2429fffffffp-2),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.161f500000000p-6), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.3d90900000000p-6),318039),
+	GreedyWalkResult(static_cast<float>(-0x1.01c5040000000p-2),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.4f30960000000p-4),223261),
+	GreedyWalkResult(static_cast<float>(-0x1.8a9b3c0000000p-4),382537),
+	GreedyWalkResult(static_cast<float>(-0x1.02d07a0000000p-4),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.9527260000001p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.047eea0000000p-1),886263),
+	GreedyWalkResult(static_cast<float>(-0x1.d0deba0000000p-1),278930),
+	GreedyWalkResult(static_cast<float>(-0x1.5c2d320000000p-1),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.f1670a0000000p-8),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.1426ce0000000p-3),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.b5f0ee0000000p-5),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.efd5180000000p-6),696486),
+	GreedyWalkResult(static_cast<float>(-0x1.f1b0440000000p-6),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.28d45c0000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.f18c5e0000000p-1),184077),
+	GreedyWalkResult(static_cast<float>(-0x1.50e1320000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.fb43600000000p-2),467026),
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp
new file mode 100644
index 000000000..ec4a799d7
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#include <tuple>
+#include <vector>
+namespace ipnsw {
+    typedef std::pair<float, int> GreedyWalkResult;   // (float, int) record; use std::get<GWR_*> to pick a member
+    constexpr int GWR_DIST = 0;   // index of the float member (presumably a distance -- confirm)
+    constexpr int GWR_VERT = 1;   // index of the int member (presumably a vertex id -- confirm)
+    extern std::vector<GreedyWalkResult> GREEDY_WALK_RESULTS;   // declaration only; the definition lives in another file
+} // namespace ipnsw
diff --git a/examples/sdh-eval-workloads/ipnsw/GroupData.hpp b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp
new file mode 100644
index 000000000..b9052ab23
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp
@@ -0,0 +1,10 @@
+#include <bsg_manycore_cuda.h>
+namespace ipnsw {
+    struct GroupData { // per-group record; IPNSWRunner::initializeGroupData fills it and push_write()s it as raw bytes
+        hb_mc_eva_t seen_mem;       // set from seen_dev(i)
+        hb_mc_eva_t candidates_mem; // set from candidates_dev(i)
+        hb_mc_eva_t results_mem;    // set from results_dev(i)
+        hb_mc_eva_t curr;           // set from curr_dev(i)
+        hb_mc_eva_t n_results;      // set from n_results_dev(i)
+    };
+} // namespace ipnsw (the stray ';' that followed this brace was removed)
diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp
new file mode 100644
index 000000000..52f0bad5b
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp
@@ -0,0 +1,255 @@
+#pragma once
+#include <vector>
+#include <string>
+#include <sstream>
+#include <Graph.hpp>
+#include <Graph500Data.hpp>
+#include <StringHelpers.hpp>
+#include <sstream>
+#include <map>
+
+namespace ipnsw {
+    //using graph_tools::Graph;
+    //using graph_tools::Graph500Data;
+
+    class Parser { // minimal argv parser: positional arguments plus "--flag value" options
+    public:
+        using OptionTable = std::map<std::string, std::string>;
+
+        Parser(){}
+
+        // Parse the command line.
+        //
+        // A token starting with "--" is an option: the token after it is consumed
+        // as its value and stored in _options under the flag's full spelling.
+        // Every other token is positional and is assigned by order of appearance:
+        //   0 exe, 1 ucode, 2 version, 3 data, 4 queries, 5-8 graphs;
+        // positional tokens beyond those nine are ignored.
+        //
+        // Throws std::runtime_error when an option is the last token and so has
+        // no value.
+        void parse(int argc, char *argv[]) {
+            int pos = 0;
+            int arg = 0;
+
+            while (arg < argc) {
+                std::string argstr = std::string(argv[arg]);
+                if (ipnsw::startswith(argstr, "--")) {
+                    // option: the next token is its value
+                    if (++arg >= argc) {
+                        throw std::runtime_error("'" + argstr + "' requires an argument");
+                    }
+                    _options[argstr] = std::string(argv[arg]);
+
+                } else {
+                    // positional argument
+                    switch (pos++) {
+                    case 0:
+                        _exe = argstr;
+                        break;
+
+                    case 1:
+                        _ucode = argstr;
+                        break;
+
+                    case 2:
+                        _version = argstr;
+                        break;
+
+                    case 3:
+                        _data = argstr;
+                        break;
+
+                    case 4:
+                        _queries = argstr;
+                        break;
+
+                    case 5:
+                    case 6:
+                    case 7:
+                    case 8:
+                        _graphs.push_back(argstr);
+                        break;
+
+                    default:
+                        break;
+                    }
+                }
+                arg++;
+            }
+        }
+
+        // Multi-line dump of the positional arguments, one "name: value" per line.
+        // Graph paths come last, numbered from 0. Options are not included.
+        std::string str() const {
+            std::stringstream ss;
+            ss << "ucode: " << _ucode << "\n"
+               << "version: " << _version << "\n"
+               << "exe: " << _exe << "\n"
+               << "data: " << _data << "\n"
+               << "queries: " << _queries << "\n";
+            for (size_t i = 0; i < _graphs.size(); ++i) {
+                ss << "graph " << i << ": " << _graphs[i] << "\n";
+            }
+            return ss.str();
+        }
+
+        // Value recorded for the flag `opt` (spelled with its leading "--").
+        // Returns "" when the flag was not given, so an option explicitly set
+        // to the empty string cannot be told apart from an absent one.
+        std::string option(const std::string &opt) const {
+            auto it = _options.find(opt);
+            return (it != _options.end()) ? it->second : std::string();
+        }
+
+        // Query indices requested with "--queries a,b,c"; empty when the option is absent.
+        // The value is split on commas; parsing stops at the first field that is not an int.
+        std::vector<int> do_queries() const {
+            std::string do_queries_str = option("--queries");
+            if (do_queries_str.empty()) {
+                return {};
+            }
+
+            // Commas become spaces so that operator>> can tokenize the list.
+            for (char &c : do_queries_str) {
+                if (c == ',') c = ' ';
+            }
+
+            // Test the stream *after* each extraction: the old ss.good() loop pushed a
+            // spurious value when the list ended in a comma/space or held a non-number.
+            std::vector<int> _do_queries;
+            std::stringstream ss(do_queries_str);
+            int q;
+            while (ss >> q) {
+                _do_queries.push_back(q);
+            }
+
+            return _do_queries;
+        }
+
+        // Value of "--num-iproducts" as an int; 100 when the option is absent.
+        int num_iproducts() const {
+            auto s = option("--num-iproducts");
+            if (s.empty())
+                return 100;
+
+            return from_string<int>(s);
+        }
+
+        // Value of "--grid-x" as an int; 1 when the option is absent.
+        int grid_x() const {
+            auto s = option("--grid-x");
+            if (s.empty())
+                return 1;
+            return from_string<int>(s);
+        }
+
+        // Value of "--grid-y" as an int; 1 when the option is absent.
+        int grid_y() const {
+            auto s = option("--grid-y");
+            if (s.empty())
+                return 1;
+            return from_string<int>(s);
+        }
+
+        // Value of "--group-x" as an int; 1 when the option is absent.
+        int grp_x() const {
+            auto s = option("--group-x");
+            if (s.empty())
+                return 1;
+            return from_string<int>(s);
+        }
+
+        // Value of "--group-y" as an int; 1 when the option is absent.
+        int grp_y() const {
+            auto s = option("--group-y");
+            if (s.empty())
+                return 1;
+            return from_string<int>(s);
+        }
+
+        std::string ucode() const   { return _ucode; }
+        std::string version() const { return _version; }
+        std::string exe() const     { return _exe; }
+        std::vector<std::string> graphs() const { return _graphs; }
+        std::string graph(int i) const { return _graphs[i]; } // i is not range-checked
+        std::string data() const    { return _data; }
+        std::string queries() const { return _queries; }
+        // Parsed state, filled in by parse(). Left public: IO reads these fields directly.
+        std::string              _ucode;
+        std::string              _version;
+        std::string              _exe;
+        std::vector<std::string> _graphs;
+        std::string              _data;
+        std::string              _queries;
+        OptionTable              _options;
+    };
+
+    class IO { // loads the graphs, database and queries named by a Parser's positional arguments
+    public:
+        IO() {}
+        IO(const Parser &p): _parser(p) {}
+
+        // Read the i-th graph file via Graph500Data::FromASCIIFile; i is not range-checked.
+        graph_tools::Graph graph(int i) {
+            const std::string & path = _parser._graphs[i];
+            std::cout << "Reading graph " << i << ": "
+                      << path << std::endl;
+            graph_tools::Graph500Data d = graph_tools::Graph500Data::FromASCIIFile(path);
+            return graph_tools::Graph::FromGraph500Data(d);
+        }
+
+        std::vector<graph_tools::Graph> graphs() { // every graph file, in command-line order
+            std::vector<graph_tools::Graph> graphs;
+            graphs.reserve(_parser._graphs.size());
+            for (int i = 0; i < static_cast<int>(_parser._graphs.size()); ++i)
+                graphs.push_back(graph(i));
+            return graphs;
+        }
+
+        // Load a raw binary file as a vector of T. The element count is
+        // file_size / sizeof(T); trailing bytes that do not form a whole T are
+        // ignored. Throws std::runtime_error if the file cannot be stat'ed,
+        // opened or fully read.
+        template <typename T>
+        std::vector<T> read(const std::string & fname) {
+            std::cerr << "Opening " << fname << std::endl;
+            struct stat st;
+            if (stat(fname.c_str(), &st) != 0)
+                throw std::runtime_error(fname + ": " + std::string(strerror(errno)));
+
+            std::vector<T> v(st.st_size/sizeof(T));
+            FILE *f = fopen(fname.c_str(), "rb");
+            if (!f)
+                throw std::runtime_error(fname + ": " + std::string(strerror(errno)));
+
+            // Read whole elements into v.data(). The previous fread(&v[0], st.st_size, 1, f)
+            // wrote past v when the size was not a multiple of sizeof(T) and ignored short reads.
+            const size_t got = v.empty() ? 0 : fread(v.data(), sizeof(T), v.size(), f);
+            fclose(f);
+            if (got != v.size())
+                throw std::runtime_error(fname + ": short read");
+            return v;
+        }
+
+        // The data file (positional argument 3) read as records of N values of type T.
+        template <typename T, int N>
+        std::vector<std::array<T, N>>
+        database() {
+            return read<std::array<T,N>>(_parser._data);
+        }
+
+        // The queries file (positional argument 4) read as records of N values of type T.
+        template <typename T, int N>
+        std::vector<std::array<T, N>>
+        queries() {
+            return read<std::array<T,N>>(_parser._queries);
+        }
+
+        std::string ucode() const { return _parser._ucode; }
+        std::vector<int> do_queries() const  { return _parser.do_queries(); }
+
+        Parser _parser;
+    };
+
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp
new file mode 100644
index 000000000..55e410789
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp
@@ -0,0 +1,17 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWResultReader.hpp"
+namespace ipnsw {
+    class IPNSWFactory {
+    public:
+        // Virtual: IPNSWRunner holds its factory as std::unique_ptr<IPNSWFactory>, so subclasses are deleted through this base.
+        virtual ~IPNSWFactory() {}
+        // Each wrapper puts the raw pointer returned by the protected hook into a unique_ptr, which then owns it.
+        std::unique_ptr<IPNSWKernelRunner> KernelRunner()const { return std::unique_ptr<IPNSWKernelRunner>(_KernelRunner()); }
+        std::unique_ptr<IPNSWResultReader> ResultReader()const { return std::unique_ptr<IPNSWResultReader>(_ResultReader()); }
+    protected:
+        // Subclass hooks. The returned pointer is eventually deleted by the unique_ptr above, so it must be deletable.
+        virtual IPNSWKernelRunner* _KernelRunner()const = 0;
+        virtual IPNSWResultReader* _ResultReader()const = 0;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp
new file mode 100644
index 000000000..4f942db01
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp
@@ -0,0 +1,69 @@
+#pragma once
+#include <HammerBlade.hpp>
+#include <Graph.hpp>
+#include <Graph500Data.hpp>
+#include <vector>
+
+namespace ipnsw {
+    class Graph { // a graph_tools::Graph plus the device addresses of its uploaded offsets/neighbors arrays
+    public:
+        Graph() : Graph(graph_tools::Graph()) {}
+        Graph(const graph_tools::Graph &g) : _graph(g), _offsets(0), _neighbors(0) {}
+        Graph(graph_tools::Graph &&g) : _graph(static_cast<graph_tools::Graph &&>(g)), _offsets(0), _neighbors(0) {} // cast is what std::move does: move, not copy
+        // Allocate device buffers for the offsets and neighbors arrays and push_write() both; does not call sync_write().
+        void initialize_on_device() {
+            using hammerblade::host::HammerBlade;
+            HammerBlade::Ptr hb = HammerBlade::Get();
+
+            auto & offsets = _graph.get_offsets();
+            auto & neighbors = _graph.get_neighbors();
+            // Sizes are element count times element size, i.e. bytes.
+            _offsets  = hb->alloc(offsets.size() * sizeof(offsets[0]));
+            _neighbors = hb->alloc(neighbors.size() * sizeof(neighbors[0]));
+
+            hb->push_write(_offsets,   &offsets[0],   offsets.size() * sizeof(offsets[0]));
+            hb->push_write(_neighbors, &neighbors[0], neighbors.size() * sizeof(neighbors[0]));
+        }
+
+        graph_tools::Graph & graph() { return _graph; }
+        const graph_tools::Graph & graph() const { return _graph; }
+        hb_mc_eva_t offsets() const { return _offsets; }
+        hb_mc_eva_t neighbors() const  { return _neighbors; }
+        // Build one {offsets, neighbors, V, E} record per graph in Gs, write the table to the device, sync_write(), return its address.
+        static hb_mc_eva_t InitializeMetadataOnDevice(const std::vector<Graph> & Gs) {
+            using hammerblade::host::HammerBlade;
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            struct metadata { // copied to the device as raw bytes, so field order and types matter
+                hb_mc_eva_t offset;
+                hb_mc_eva_t neighbors;
+                int V;
+                int E;
+            };
+            std::vector<metadata> metad;
+            metad.reserve(Gs.size());
+            for (const Graph & g : Gs) {
+                std::cout << "Host: offset = " << std::hex << g.offsets() << " neighbors = " << g.neighbors() << std::endl;
+                std::cout << std::dec;
+                metadata m; // plain assignments: the old initializer mixed designated and positional fields
+                m.offset = g.offsets();
+                m.neighbors = g.neighbors();
+                m.V = g.graph().num_nodes();
+                m.E = g.graph().num_edges();
+                metad.push_back(m);
+            }
+
+            // data() rather than &metad[0]: well-defined even when Gs is empty.
+            const size_t bytes = sizeof(metadata) * metad.size();
+            hb_mc_eva_t metadata_dev = hb->alloc(bytes);
+            hb->push_write(metadata_dev, metad.data(), bytes);
+            hb->sync_write();
+            return metadata_dev;
+        }
+
+    private:
+        graph_tools::Graph _graph;
+
+        hb_mc_eva_t _offsets;   // 0 until initialize_on_device() assigns it
+        hb_mc_eva_t _neighbors; // 0 until initialize_on_device() assigns it
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
new file mode 100644
index 000000000..1604cb93e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
@@ -0,0 +1,40 @@
+#pragma once
+#include "HammerBlade.hpp"
+#include <memory>
+#include <string>
+namespace ipnsw {
+    class IPNSWRunner; // forward declaration
+
+    class IPNSWKernelRunner { // base class: subclasses say which kernel to launch for an IPNSWRunner and with what arguments
+    public:
+        using HammerBlade = hammerblade::host::HammerBlade;
+        using Dim = hammerblade::host::Dim;
+        IPNSWKernelRunner(){}
+        virtual ~IPNSWKernelRunner() {} // virtual: IPNSWFactory hands these out as std::unique_ptr<IPNSWKernelRunner>
+    protected:
+        virtual std::string kernelName(const IPNSWRunner & runner) const =0; // third argument of push_jobv
+        virtual std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const =0; // fourth argument of push_jobv
+
+    public:
+        virtual Dim gd(const IPNSWRunner &runner) const { // first argument of push_jobv; 1x1 unless overridden
+            return Dim(1,1);
+        }
+        virtual Dim tgd(const IPNSWRunner &runner) const { // second argument of push_jobv; 1x1 unless overridden
+            return Dim(1,1);
+        }
+
+        // Hooks with empty default bodies; runKernel() below does not call them itself.
+        virtual void beforeLaunchKernel(const IPNSWRunner &runner) { }
+        virtual void afterLaunchKernel(const IPNSWRunner &runner)  { }
+        // Push one job built from gd/tgd/kernelName/argv, then call exec().
+        void runKernel(IPNSWRunner &runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            hb->push_jobv(gd(runner),
+                          tgd(runner),
+                          kernelName(runner),
+                          argv(runner));
+            hb->exec();
+        }
+    };
+
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp
new file mode 100644
index 000000000..19eaff181
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp
@@ -0,0 +1,13 @@
+#pragma once
+#include "HammerBlade.hpp"
+namespace ipnsw {
+    class IPNSWRunner;
+
+    class IPNSWResultReader {
+    protected:
+        using HammerBlade = hammerblade::host::HammerBlade;
+    public:
+        virtual ~IPNSWResultReader() {} // virtual: IPNSWFactory hands these out as std::unique_ptr<IPNSWResultReader>
+        virtual void readResults(const IPNSWRunner & runner) {} // default does nothing; subclasses override
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
new file mode 100644
index 000000000..feebf121d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
@@ -0,0 +1,339 @@
+#pragma once
+#include "IO.hpp"
+#include "HammerBlade.hpp"
+#include "IPNSWGraph.hpp"
+#include "IPNSWFactory.hpp"
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWResultReader.hpp"
+#include "GreedyWalkResults.hpp"
+#include "GroupData.hpp"
+#include <memory>
+
+namespace ipnsw {
+
+    class IPNSWRunnerConfig { // run settings: set representation plus grid and group dimensions (all dimensions default to 1)
+    public:
+        typedef enum {
+            Dense,
+            BitVector,
+            Sparse,
+        } SetType;
+
+        IPNSWRunnerConfig():
+            _set_type(BitVector),
+            _grid_x(1),
+            _grid_y(1),
+            _grp_x(1),
+            _grp_y(1) {
+        }
+
+        SetType set_type() const { return _set_type; }
+        SetType & set_type() { return _set_type; }
+        // Display name of the current set type. Returns "Unknown" for a value
+        // that is not one of the three enumerators.
+        std::string set_type_str() const {
+            switch (set_type()) {
+            case Dense:     return "Dense";
+            case BitVector: return "Dense Bit Vector";
+            case Sparse:    return "Sparse";
+            }
+            // Previously control fell off the end here, which is undefined behavior.
+            return "Unknown";
+        }
+
+        int & grid_x()       { return _grid_x; }
+        int   grid_x() const { return _grid_x; }
+        int & grid_y()       { return _grid_y; }
+        int   grid_y() const { return _grid_y; }
+
+        int & grp_x()       { return _grp_x; }
+        int   grp_x() const { return _grp_x; }
+        int & grp_y()       { return _grp_y; }
+        int   grp_y() const { return _grp_y; }
+
+    private:
+        SetType _set_type;
+        int     _grid_x;
+        int     _grid_y;
+        int     _grp_x;
+        int     _grp_y;
+    };
+
+    class IPNSWRunner {
+    public:
+        //static constexpr int QUERY = 276; // fewest dot products for greedy walk
+        //static constexpr int QUERY = 472; // fewest dot products for beam search
+        //static constexpr int QUERY = 427;
+        //static constexpr int QUERY = 355;
+        //static constexpr int QUERY = 2;
+        static constexpr int QUERY = 188;
+        //static constexpr int QUERY = 229;
+        //static constexpr int QUERY = 490;
+        //static constexpr int QUERY = 16;
+        //static constexpr int QUERY = 461;
+        //static constexpr int QUERY = 470;
+        // QUERY is the query index used when "--queries" selects nothing (see
+        // initializeDeviceMemoryQuery); the commented-out lines are alternative values.
+        static constexpr size_t CANDIDATES_MAX = 513; // GreedyWalkResult slots per group in the candidates buffer
+        static constexpr size_t RESULTS_MAX    = 129; // GreedyWalkResult slots per group in the results buffer
+        // Short aliases for the host-side types used throughout this class.
+        using HammerBlade = hammerblade::host::HammerBlade;
+        using Dim = hammerblade::host::Dim;
+
+        IPNSWRunner(const Parser &p,
+                    std::unique_ptr<IPNSWFactory> & fact) : // NOTE: `fact` is moved from by the delegated constructor
+            IPNSWRunner(p, fact, IPNSWRunnerConfig()) { // same as the three-argument form with a default config
+        }
+        
+        IPNSWRunner(const Parser &p,
+                    std::unique_ptr<IPNSWFactory> & fact,
+                    const IPNSWRunnerConfig &cfg):
+            _factory(std::move(fact)), // takes ownership: the caller's pointer is empty afterwards
+            _cfg(cfg) {
+            _hb = HammerBlade::Get();
+            _io.reset(new IO(p)); // IO keeps its own copy of the parser
+            _kernel_runner = _factory->KernelRunner(); // both helpers come from the factory now held in _factory
+            _result_reader = _factory->ResultReader();
+        }
+
+        virtual ~IPNSWRunner() { delete _hb; } // NOTE(review): _hb came from HammerBlade::Get(), which other code also calls -- confirm this object should delete it
+
+        void readInput() { // loads the four graphs, the database and the queries through _io
+            auto graphs   = _io->graphs();
+            if (graphs.size() < 4) // graphs[0..3] are indexed below; fewer files used to be an out-of-range access
+                throw std::runtime_error("readInput: expected 4 graph files, got " + std::to_string(graphs.size()));
+            // _graphs holds the files in reverse command-line order.
+            _graphs = {
+                Graph(std::move(graphs[3])), Graph(std::move(graphs[2])),
+                Graph(std::move(graphs[1])), Graph(std::move(graphs[0]))
+            };
+            _db       = _io->database<float,100>();
+            _queries  = _io->queries<float,100>();
+        }
+
+        void loadProgram() { // hands the path returned by ucodePath() to HammerBlade::load_application
+            _hb->load_application(ucodePath());
+        }
+
+        void initializeDeviceMemoryDB() {
+            const size_t nbytes = _db.size() * sizeof(_db[0]); // whole database, in bytes
+            std::cout << "Initializing database " << std::endl;
+            _db_dev = _hb->alloc(nbytes);
+            _hb->push_write(_db_dev, &_db[0], nbytes);
+        }
+
+        void initializeDeviceMemoryQuery() {
+            // Copy the selected query vectors into one contiguous device buffer, in the order given.
+            std::cout << "Initializing query "  << std::endl;
+            std::vector<int> do_queries = _io->do_queries();
+            if (do_queries.empty())
+                do_queries = {QUERY}; // nothing selected on the command line: use the built-in default
+            // Validate before allocating: these indices come from the command line and used to subscript _queries unchecked.
+            for (int query : do_queries) {
+                if (query < 0 || static_cast<size_t>(query) >= _queries.size())
+                    throw std::runtime_error("query index out of range: " + std::to_string(query));
+            }
+            _query_dev = _hb->alloc(sizeof(_queries[0]) * do_queries.size());
+            for (hb_mc_eva_t qidx = 0; qidx < do_queries.size(); ++qidx) {
+                auto & q = _queries[do_queries[qidx]];
+                _hb->push_write(_query_dev + qidx * sizeof(q), &q, sizeof(q));
+            }
+        }
+
+        size_t seen_dev_size_per_group() const {
+            // Bytes of "seen"-set storage for one group: sizeof(int) per database
+            // entry (Dense, Sparse), or one bit per entry rounded up to whole
+            // 32-entry words of sizeof(int) bytes each (BitVector).
+            switch (_cfg.set_type()) {
+            case IPNSWRunnerConfig::Dense:
+            case IPNSWRunnerConfig::Sparse:
+                return _db.size() * sizeof(int);
+            case IPNSWRunnerConfig::BitVector:
+                return ((_db.size() + 31) / 32) * sizeof(int);
+            }
+            throw std::runtime_error("seen_dev_size_per_group: invalid set type"); // was: fell off the end (undefined behavior)
+        }
+        void initializeDeviceMemorySeen() {
+            // One allocation of seen_dev_size_per_group() bytes per group, appended to _seen_dev in group order.
+            std::cout << "Initializing seen set " << std::endl;
+            for (int grp = 0; grp < numGroups(); ++grp) {
+                _seen_dev.push_back(_hb->alloc(seen_dev_size_per_group()));
+            }
+        }
+
+        void initializeDeviceMemoryGraphs() {
+            // Upload every graph first: the metadata table built afterwards records their device addresses.
+            for (size_t g = 0; g < _graphs.size(); ++g)
+                _graphs[g].initialize_on_device();
+            _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs);
+        }
+
+        void initializeDeviceVCurrDCurr() {
+            // Allocate one GreedyWalkResult per group, then log the addresses computed for group 0 only.
+            hb_mc_eva_t grp = 0;
+            _curr_dev = _hb->alloc(sizeof(GreedyWalkResult) * numGroups());
+            std::cout << "_curr_dev=" << std::hex << _curr_dev << std::endl;
+            std::cout << "  curr(" << std::dec << grp << ")=" << std::hex <<   curr_dev(grp) << std::endl;
+            std::cout << "v_curr(" << std::dec << grp << ")=" << std::hex << v_curr_dev(grp) << std::endl;
+            std::cout << "d_curr(" << std::dec << grp << ")=" << std::hex << d_curr_dev(grp) << std::endl;
+            std::cout << std::dec; // leave the stream in decimal mode for later output
+        }
+
+        /** Bytes reserved per group for the candidates buffer (CANDIDATES_MAX entries). */
+        size_t candidates_dev_size_per_group() const {
+            const size_t entry_bytes = sizeof(GreedyWalkResult);
+            return entry_bytes * CANDIDATES_MAX;
+        }
+
+        /** Allocate one candidates buffer per group and remember its device address. */
+        void initializeDeviceCandidateDev() {
+            int group = 0;
+            while (group < numGroups()) {
+                _candidates_dev.push_back(_hb->alloc(candidates_dev_size_per_group()));
+                ++group;
+            }
+        }
+
+        /** Bytes reserved per group for the results buffer (RESULTS_MAX entries). */
+        size_t results_dev_size_per_group() const {
+            const size_t entry_bytes = sizeof(GreedyWalkResult);
+            return entry_bytes * RESULTS_MAX;
+        }
+
+        /** Allocate one results buffer per group and remember its device address. */
+        void initializeDeviceResultsDev() {
+            int group = 0;
+            while (group < numGroups()) {
+                _results_dev.push_back(_hb->alloc(results_dev_size_per_group()));
+                ++group;
+            }
+        }
+
+        /** Allocate the per-group result counters: one int for each group. */
+        void initializeDeviceNResultsDev() {
+            const size_t bytes = sizeof(int) * numGroups();
+            _n_results_dev = _hb->alloc(bytes);
+        }
+
+        /**
+         * Allocate an array of GroupData (one entry per group) on the device
+         * and queue a write of each entry.
+         *
+         * Each entry bundles the device addresses returned by seen_dev(),
+         * candidates_dev(), results_dev(), curr_dev() and n_results_dev() for
+         * that group, so those buffers must already be allocated when this runs.
+         */
+        void initializeGroupData() {
+            _group_data_dev = _hb->alloc(sizeof(GroupData) * numGroups());
+            for (int i = 0; i < numGroups(); ++i) {
+                // NOTE(review): designated initializers must follow the member
+                // order of GroupData, which is declared elsewhere
+                GroupData gd = {
+                    .seen_mem       = seen_dev(i),
+                    .candidates_mem = candidates_dev(i),
+                    .results_mem    = results_dev(i),
+                    .curr           = curr_dev(i),
+                    .n_results      = n_results_dev(i),
+                };
+                // gd is a loop-local temporary.
+                // NOTE(review): presumably push_write copies the data before
+                // returning - confirm against HammerBlade::push_write
+                _hb->push_write(group_data_dev(i), &gd, sizeof(gd));
+            }
+        }
+
+        /**
+         * Run every device-memory initialization step in order.
+         *
+         * initializeGroupData() must stay last: it reads the per-group
+         * addresses (seen, candidates, results, curr, n_results) produced by
+         * the steps before it.
+         */
+        void initializeDeviceMemory() {
+            initializeDeviceMemoryDB();
+            initializeDeviceMemoryQuery();
+            initializeDeviceMemorySeen();
+            initializeDeviceMemoryGraphs();
+            initializeDeviceVCurrDCurr();
+            initializeDeviceCandidateDev();
+            initializeDeviceResultsDev();
+            initializeDeviceNResultsDev();
+            initializeGroupData();
+        }
+
+        /**
+         * Launch the kernel through the configured kernel runner.
+         *
+         * Sequence: the runner's beforeLaunchKernel hook, _hb->sync_rw(), the
+         * runner's runKernel, then its afterLaunchKernel hook.
+         */
+        void runKernel() {
+            // the hook may queue additional device writes before the sync below
+            _kernel_runner->beforeLaunchKernel(*this);
+            // sync
+            // NOTE(review): presumably sync_rw() performs the transfers queued
+            // with push_write - confirm against HammerBlade
+            std::cout << "Starting DMA" << std::endl;
+            _hb->sync_rw();
+            std::cout << "Launching kernel" << std::endl;
+            _kernel_runner->runKernel(*this);
+            _kernel_runner->afterLaunchKernel(*this);
+        }
+
+        /** Delegate result collection to the configured result reader. */
+        void readResults() {
+            IPNSWResultReader &reader = *_result_reader;
+            reader.readResults(*this);
+        }
+
+        /**
+         * Execute the whole workload: read the inputs, load the program,
+         * set up device memory, run the kernel, then read the results back.
+         */
+        void run() {
+            readInput();
+            loadProgram();
+            initializeDeviceMemory();
+            runKernel();
+            readResults();
+        }
+
+        /////////////
+        // Getters //
+        /////////////
+        // Path reported by the IO helper's ucode()
+        std::string ucodePath() const {
+            return _io->ucode();
+        }
+
+        // Device address of the database buffer
+        hb_mc_eva_t db_dev() const { return _db_dev; }
+        // Device address of the qidx-th query slot in the query buffer.
+        // qidx is a slot position in that buffer; sizeof() is unevaluated, so
+        // _queries is not actually indexed here.
+        hb_mc_eva_t query_dev(hb_mc_eva_t qidx) const {
+            return _query_dev + qidx * sizeof(_queries[qidx]);
+        }
+
+        // Device address of group grp's seen-set buffer
+        hb_mc_eva_t seen_dev(hb_mc_eva_t grp) const {
+            return _seen_dev[grp];
+        }
+
+        // Device address of group grp's GreedyWalkResult entry
+        hb_mc_eva_t curr_dev(hb_mc_eva_t grp = 0) const {
+            return _curr_dev + (grp*sizeof(GreedyWalkResult));
+        }
+
+        // Address sizeof(float) bytes into the group's GreedyWalkResult.
+        // NOTE(review): presumably GreedyWalkResult is {float distance; int vertex} - confirm
+        hb_mc_eva_t v_curr_dev(hb_mc_eva_t grp) const {
+            return curr_dev(grp) + sizeof(float);
+        }
+        // Address of the start of the group's GreedyWalkResult
+        hb_mc_eva_t d_curr_dev(hb_mc_eva_t grp) const {
+            return curr_dev(grp);
+        }
+
+        // Device address returned by Graph::InitializeMetadataOnDevice
+        hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; }
+
+        // Device address of group grp's candidates buffer
+        hb_mc_eva_t candidates_dev(hb_mc_eva_t grp) const {
+            return _candidates_dev[grp];
+        }
+
+        // Device address of group grp's results buffer
+        hb_mc_eva_t results_dev(hb_mc_eva_t grp) const {
+            return _results_dev[grp];
+        }
+
+        // Device address of group grp's result counter (one int per group)
+        hb_mc_eva_t n_results_dev(hb_mc_eva_t grp) const {
+            return _n_results_dev + grp * sizeof(int);
+        }
+        
+        // Device address of group grp's GroupData entry
+        hb_mc_eva_t group_data_dev(hb_mc_eva_t grp) const {
+            return _group_data_dev + grp * sizeof(GroupData);
+        }
+
+        // Number of groups: x * y of the dimensions returned by the kernel runner's gd()
+        int numGroups() const { return _kernel_runner->gd(*this).x() * _kernel_runner->gd(*this).y(); }
+
+        // Host copy of the database: one 100-float vector per entry
+        const std::vector<std::array<float,100>> & db() const { return _db; }
+
+        // Read-only access to the runner configuration
+        const IPNSWRunnerConfig & cfg() const { return _cfg; }
+        /////////////
+        // Setters //
+        /////////////
+
+    private:
+        // Runner configuration; exposed read-only through cfg()
+        IPNSWRunnerConfig                     _cfg;
+
+    public:
+        // IO helper; ucodePath() forwards to _io->ucode()
+        std::unique_ptr<IO>                   _io;
+
+    private:
+        // Graphs initialized on the device by initializeDeviceMemoryGraphs()
+        std::vector<Graph>                    _graphs;
+        // Host copies of the database and query vectors (100 floats each)
+        std::vector<std::array<float, 100>>   _db;
+        std::vector<std::array<float, 100>>   _queries;
+        // NOTE(review): not referenced by the methods visible in this part of the class
+        std::vector<GroupData>                _group_data;
+        // Device handle used for alloc(), push_write() and sync_rw()
+        HammerBlade::Ptr                      _hb;
+
+        // device pointers
+        hb_mc_eva_t _db_dev;
+        hb_mc_eva_t _query_dev;
+        // one seen-set buffer per group
+        std::vector<hb_mc_eva_t> _seen_dev;
+        // array of GreedyWalkResult, one per group
+        hb_mc_eva_t _curr_dev;
+        hb_mc_eva_t _graph_metadata_dev;
+        // one candidates / results buffer per group
+        std::vector<hb_mc_eva_t> _candidates_dev;
+        std::vector<hb_mc_eva_t> _results_dev;
+        // array of int, one per group
+        hb_mc_eva_t _n_results_dev;
+        // array of GroupData, one per group
+        hb_mc_eva_t _group_data_dev;
+
+        // composites
+        std::unique_ptr<IPNSWKernelRunner> _kernel_runner;
+        std::unique_ptr<IPNSWResultReader> _result_reader;
+        std::unique_ptr<IPNSWFactory>      _factory;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
new file mode 100644
index 000000000..ff0468903
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
@@ -0,0 +1,19 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+namespace ipnsw {
+    /**
+     * Factory for the inner-product micro-benchmark: creates an
+     * IProductUBmkKernelRunner configured with the requested iteration count
+     * and an IProductUBmkResultReader.
+     */
+    class IProductUBmkFactory : public IPNSWFactory {
+    public:
+        /** @param iterations value handed to the kernel runners this factory creates */
+        IProductUBmkFactory(int iterations = 10)
+            : _iterations(iterations)
+        {}
+
+    protected:
+        /** @return a newly allocated kernel runner; the caller takes ownership */
+        virtual IPNSWKernelRunner *_KernelRunner() const {
+            IPNSWKernelRunner *made = new IProductUBmkKernelRunner(_iterations);
+            return made;
+        }
+
+        /** @return a newly allocated result reader; the caller takes ownership */
+        virtual IPNSWResultReader *_ResultReader() const {
+            IPNSWResultReader *made = new IProductUBmkResultReader;
+            return made;
+        }
+
+        int _iterations;
+    };
+}
+
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
new file mode 100644
index 000000000..1ee4da763
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+
+namespace ipnsw {
+    /**
+     * Kernel runner for the serial inner-product micro-benchmark. It launches
+     * the "inner_product_ubmk" kernel with the database address, the first
+     * query slot and the iteration count as arguments.
+     */
+    class IProductUBmkKernelRunner : public IPNSWKernelRunner {
+    public:
+        /** @param iterations number of inner products passed to the kernel */
+        IProductUBmkKernelRunner(int iterations = 10)
+            : IPNSWKernelRunner()
+            , _iterations(iterations)
+        {}
+
+    private:
+        /** Name of the kernel to launch; the runner argument is not used. */
+        std::string kernelName(const IPNSWRunner & runner) const {
+            return std::string("inner_product_ubmk");
+        }
+
+        /** Kernel arguments, in order: database, query slot 0, iteration count. */
+        virtual std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+            std::vector<hb_mc_eva_t> args;
+            args.push_back(runner.db_dev());                        // database
+            args.push_back(runner.query_dev(0));                    // query
+            args.push_back(static_cast<hb_mc_eva_t>(_iterations));  // number of inner products
+            return args;
+        }
+
+    protected:
+        int _iterations;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp
new file mode 100644
index 000000000..964cc2d8e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp
@@ -0,0 +1,20 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+#include "IProductUBmkFactory.hpp"
+#include "IProductUBmkParallelKernelRunner.hpp"
+
+namespace ipnsw {
+    /**
+     * Factory for the parallel inner-product micro-benchmark. It reuses the
+     * result reader of IProductUBmkFactory and swaps in
+     * IProductUBmkParallelKernelRunner as the kernel runner.
+     */
+    class IProductUBmkParallelFactory : public IProductUBmkFactory {
+    public:
+        /** @param iterations value forwarded to IProductUBmkFactory */
+        IProductUBmkParallelFactory(int iterations = 10):
+            IProductUBmkFactory(iterations) {
+        }
+
+    private:
+        /** @return a newly allocated parallel kernel runner; the caller takes ownership */
+        IPNSWKernelRunner *_KernelRunner() const override {
+            return new IProductUBmkParallelKernelRunner(_iterations);
+        }
+
+    };
+}
+
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp
new file mode 100644
index 000000000..668114fb2
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp
@@ -0,0 +1,64 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+#include "HammerBlade.hpp"
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+namespace ipnsw {
+    /**
+     * Kernel runner for the parallel inner-product micro-benchmark.
+     *
+     * Before launch it builds a shuffled list of database indices
+     * (_iterations entries per group), copies it to the device, and passes
+     * its address to the kernel as a fourth argument. The list is freed
+     * again after the launch.
+     */
+    class IProductUBmkParallelKernelRunner : public IProductUBmkKernelRunner {
+    public:
+        /** @param iterations number of inner products per group */
+        IProductUBmkParallelKernelRunner(int iterations = 10) :
+            IProductUBmkKernelRunner(iterations),
+            _visit_dev(0) {
+        }
+
+    private:
+        using HammerBlade = hammerblade::host::HammerBlade;
+
+        /**
+         * Build the visit list and queue its transfer to the device.
+         *
+         * @throws std::runtime_error if the database is empty (the index
+         *         computation would otherwise divide by zero)
+         */
+        void beforeLaunchKernel(const IPNSWRunner &runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+
+            _visit.clear();
+
+            const size_t db_size = runner.db().size();
+            if (db_size == 0) {
+                throw std::runtime_error("IProductUBmkParallelKernelRunner: database is empty");
+            }
+
+            // hoisted: numGroups() does not change while the list is built
+            const int n_visits = _iterations * runner.numGroups();
+            for (int i = 0; i < n_visits; ++i) {
+                _visit.push_back((i*3) % db_size);
+            }
+
+            // std::random_shuffle was removed in C++17. A default-seeded engine
+            // keeps the order reproducible from run to run, although it is not
+            // the same permutation random_shuffle produced.
+            std::mt19937 rng;
+            std::shuffle(_visit.begin(), _visit.end(), rng);
+
+            _visit_dev = hb->alloc(sizeof(int) * _visit.size());
+
+            std::cout << "beforeLaunchKernel called: _visit_dev = " << std::hex << _visit_dev << std::endl;
+            std::cout << std::dec;
+
+            // nothing to copy when the list is empty (zero iterations)
+            if (!_visit.empty()) {
+                hb->push_write(_visit_dev, _visit.data(), sizeof(int) * _visit.size());
+            }
+        }
+
+        /** Kernel arguments, in order: database, query slot 0, iteration count, visit list. */
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const override {
+            std::cout << "Called" << std::endl;
+            std::vector<hb_mc_eva_t> argv = {
+                runner.db_dev(), // database
+                runner.query_dev(0), // query
+                static_cast<hb_mc_eva_t>(_iterations), // number of inner products
+                _visit_dev, // vectors to visit
+            };
+            return argv;
+        }
+
+        /** Release the device copy of the visit list and drop the host copy. */
+        void afterLaunchKernel(const IPNSWRunner &runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            hb->free(_visit_dev);
+            _visit.clear();
+        }
+
+        /** Grid dimensions taken from the runner configuration. */
+        virtual Dim gd(const IPNSWRunner &runner) const {
+            return Dim(runner.cfg().grid_x(),runner.cfg().grid_y());
+        }
+
+        /** Tile-group dimensions taken from the runner configuration. */
+        virtual Dim tgd(const IPNSWRunner &runner) const {
+            return Dim(runner.cfg().grp_x(),runner.cfg().grp_y());
+        }
+
+        hb_mc_eva_t          _visit_dev;  // device address of the visit list
+        std::vector<int>     _visit;      // host copy of the visit list
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp
new file mode 100644
index 000000000..300990b18
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp
@@ -0,0 +1,12 @@
+#pragma once
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+
+namespace ipnsw {
+    /**
+     * Result reader for the inner-product micro-benchmark. There is nothing
+     * to read back; it only reports completion on stdout.
+     */
+    class IProductUBmkResultReader : public IPNSWResultReader {
+    public:
+        /** Print "Done" followed by a newline and flush; the runner is not used. */
+        void readResults(const IPNSWRunner & runner) {
+            std::cout << "Done" << '\n' << std::flush;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
new file mode 100644
index 000000000..a8f6da2c5
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -0,0 +1,184 @@
+#####################
+# Standard includes #
+#####################
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+include $(REPLICANT_PATH)/environment.mk
+
+# Declared first so that `all` is the default goal; its prerequisites are
+# attached at the bottom of this file.
+all:
+
+##################
+# Prepare inputs #
+##################
+ipnsw-eval-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw
+
+# Input files in the order they are handed to the program (see C_ARGS).
+ipnsw-inputs = $(addprefix $(ipnsw-eval-dir)/data/,\
+    database_music100.bin \
+    query_music100.bin \
+    music.edges.level_0 \
+    music.edges.level_1 \
+    music.edges.level_2 \
+    music.edges.level_3)
+
+# prep.sh generates all the inputs, but only the first one is used as the
+# rule target so the script is not run more than once
+ipnsw-input := $(firstword $(ipnsw-inputs))
+$(ipnsw-input):
+	cd $(ipnsw-eval-dir) && bash prep.sh
+
+#######################################
+# Base class run directory generation #
+#######################################
+# Template that generates the rules for one run directory, run/<name>.
+# Instantiate it with $(eval $(call run-dir,<name>,<version>,<args>)).
+#
+# $1 = name    (run directory is run/$1)
+# $2 = version (kernel source is kernel/$2/kernel.cpp)
+# $3 = args    (appended to C_ARGS in the generated Makefile)
+#
+# Anything that must survive until the recipe runs is written with `$$`,
+# including $$(MAKE), so the sub-make is recognised as a recursive make
+# invocation instead of being expanded when the template is called.
+define run-dir
+run/$1/kernel.cpp: kernel/$2/kernel.cpp
+	@mkdir -p $$(dir $$@)
+	@cp $$< $$@
+	@echo "MAKING $$@"
+
+run/$1/Makefile: template.mk
+	@mkdir -p $$(dir $$@)
+	@cat $$< > $$@
+	@echo "C_ARGS += $3" >> $$@
+	@echo "MAKING $$@"
+
+.PHONY: generate-$1 build-$1 purge-$1 run-$1 exec-$1 profile-$1 debug-$1 saifgen-$1 saif-$1
+
+generate-$1: run/$1/Makefile run/$1/kernel.cpp
+
+purge-$1:
+	rm -rf run/$1
+
+build-$1: generate-$1
+	+$$(MAKE) -C run/$1 main.riscv
+
+exec-$1: generate-$1
+	+$$(MAKE) -C run/$1 main.exec.log
+
+profile-$1: generate-$1
+	+$$(MAKE) -C run/$1 main.profile.log
+
+# The directory and the goal are separate arguments: `-C run/$1/main.debug.log`
+# would ask make to change into a directory that does not exist.
+debug-$1: generate-$1
+	+$$(MAKE) -C run/$1 main.debug.log
+
+# Named saifgen-$1 so it matches the aggregate `saifgen` target.
+saifgen-$1: generate-$1
+	+$$(MAKE) -C run/$1 main.saifgen.log
+
+# Backward-compatible alias for the previous target name.
+saif-$1: saifgen-$1
+endef
+
+#################################
+# Common command line arguments #
+#################################
+# Every run receives the input files as arguments (see the `run` template).
+C_ARGS += $(ipnsw-inputs)
+
+###############
+# Greedy Walk #
+###############
+# greedy-walk version -> dimensions
+# <version>-grp-x / <version>-grp-y are passed as --group-x / --group-y
+# by the `run` template.
+#  inner product with ipc=0.3 (8x4)
+greedy_walk-grp-x := 1
+greedy_walk-grp-y := 1
+#  inner product with ipc=0.43 (8x4)
+greedy_walk_v1-grp-x := 1
+greedy_walk_v1-grp-y := 1
+#  inner product with FLOPS/cycle=0.2  (8x4)
+greedy_walk_v2-grp-x := 1
+greedy_walk_v2-grp-y := 1
+#  inner product with FLOPS/cycle=0.26 (8x4)
+greedy_walk_v3-grp-x := 1
+greedy_walk_v3-grp-y := 1
+#  inner product v4-serial
+greedy_walk_v3-ipv4serial-grp-x := 1
+greedy_walk_v3-ipv4serial-grp-y := 1
+#  greedy_walk_v3 + ParallelInnerProduct_v1
+greedy_walk_v4-grp-x := 2
+greedy_walk_v4-grp-y := 2
+
+###############
+# Beam Search #
+###############
+# beam-search version -> dimensions
+
+#  very slow - uses a very dumb sparse set
+beam_search-grp-x := 1
+beam_search-grp-y := 1
+#  dense set - inner product with ipc=0.3  (8x4)
+beam_search_v1-grp-x := 1
+beam_search_v1-grp-y := 1
+#  dense set - inner product with ipc=0.43 (8x4)
+beam_search_v2-grp-x := 1
+beam_search_v2-grp-y := 1
+#  + inner_product_v2 (flops/cycle=0.2039) (8x4)
+beam_search_v3-grp-x := 1
+beam_search_v3-grp-y := 1
+#  + inner_product_v3 (flops/cycle=0.2663) (8x4)
+beam_search_v4-grp-x := 1
+beam_search_v4-grp-y := 1
+#  + Bit vector for dense set
+beam_search_v5-grp-x := 1
+beam_search_v5-grp-y := 1
+#  + Bit vector for dense set + inner product v4 serial
+beam_search_v5-ipv4serial-grp-x := 1
+beam_search_v5-ipv4serial-grp-y := 1
+#  beam_search_v5 + inner_product_parallel_v3
+beam_search_v6-grp-x := 2
+beam_search_v6-grp-y := 2
+# beam_search_v6 but with 1x2 tile group
+beam_search_v7-grp-x := 1
+beam_search_v7-grp-y := 2
+# beam_search_v5 but edges of candidates traversed in parallel
+beam_search_v8-grp-x := 4
+beam_search_v8-grp-y := 4
+# combination of beam_search_v8 + beam_search_v6
+beam_search_v9-grp-x := 4
+beam_search_v9-grp-y := 4
+# beam_search_v5 but edges of candidates traversed in parallel
+beam_search_v10-grp-x := 8
+beam_search_v10-grp-y := 4
+
+# Template that registers one (version, query) run.
+# Instantiate it with $(eval $(call run,<version>,<query>)).
+#
+# It instantiates run-dir for the directory run/<version>_query<query> and
+# passes these program arguments: the kernel.riscv path inside that
+# directory, the version name, $(C_ARGS), --queries <query>, and
+# --group-x / --group-y taken from <version>-grp-x / <version>-grp-y.
+# It then hooks the per-run targets into the aggregate targets and makes the
+# targets that run the program depend on the prepared input data.
+#
+# $1 = version
+# $2 = query
+run-name = $(1)_query$(2)
+define run
+$(eval $(call run-dir,$(call run-name,$1,$2),$1,\
+$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call run-name,$1,$2)/kernel.riscv \
+$1 \
+$(C_ARGS) \
+--queries $(2) \
+--group-x $($(1)-grp-x) \
+--group-y $($(1)-grp-y) \
+))
+generate: generate-$(call run-name,$1,$2)
+purge:    purge-$(call run-name,$1,$2)
+build:    build-$(call run-name,$1,$2)
+exec:     exec-$(call run-name,$1,$2)
+profile:  profile-$(call run-name,$1,$2)
+debug:    debug-$(call run-name,$1,$2)
+saifgen:  saifgen-$(call run-name,$1,$2)
+
+saifgen-$(call run-name,$1,$2): $(ipnsw-input)
+profile-$(call run-name,$1,$2): $(ipnsw-input)
+debug-$(call run-name,$1,$2):   $(ipnsw-input)
+exec-$(call run-name,$1,$2):    $(ipnsw-input)
+
+endef
+# Aggregate targets; the `run` template attaches the per-run targets to them.
+.PHONY: generate purge build exec profile debug saifgen
+
+#############################################################
+# Define which queries we want to run and instantiate rules #
+#############################################################
+greedy-walk-queries := 4 16 229 276 461 470 490
+$(foreach q,$(greedy-walk-queries),$(eval $(call run,greedy_walk_v4,$(q))))
+
+beam-search-queries := \
+    2   188 229 355 427 472 \
+    25  74  112 140 148 178 \
+    214 244 278 302 331 \
+    396 420 452 489 511
+$(foreach q,$(beam-search-queries),$(eval $(call run,beam_search_v10,$(q))))
+
+# The default goal executes every registered run.
+.PHONY: all
+all: exec
diff --git a/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
new file mode 100644
index 000000000..39e09b1a9
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
@@ -0,0 +1,17 @@
+#pragma once
+#include <string>
+#include <sstream>
+
+namespace ipnsw {
+    /**
+     * Test whether st begins with prefix.
+     *
+     * Declared inline rather than static: this is a header, and a static
+     * function gives every including translation unit its own copy (plus an
+     * unused-function warning where it is not called).
+     *
+     * @return true if the first prefix.size() characters of st equal prefix;
+     *         an empty prefix always matches
+     */
+    inline bool startswith(const std::string &st, const std::string &prefix) {
+        // rfind with pos = 0 can only match at the very start of st
+        return st.rfind(prefix, 0) == 0;
+    }
+
+    /**
+     * Parse a value of type T from str with operator>>.
+     *
+     * @return the parsed value; a value-initialized T if extraction never
+     *         writes to it
+     */
+    template <typename T>
+    T from_string(const std::string &str) {
+        std::stringstream ss(str);
+        T v{};
+        ss >> v;
+        return v;
+    }
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/graph-tools b/examples/sdh-eval-workloads/ipnsw/graph-tools
new file mode 160000
index 000000000..a7304c67c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/graph-tools
@@ -0,0 +1 @@
+Subproject commit a7304c67c34070877e57719fd183c4a5ee569904
diff --git a/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
new file mode 160000
index 000000000..9a26b6d0c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
@@ -0,0 +1 @@
+Subproject commit 9a26b6d0cbe04a9cc627cce7049a0ba97ca66621
diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
new file mode 160000
index 000000000..f113c0865
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
@@ -0,0 +1 @@
+Subproject commit f113c0865d2d9491551dab8f8b500445b75429bc
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
new file mode 100644
index 000000000..23f2b20d1
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
@@ -0,0 +1,80 @@
+#include "bsg_manycore_regression.h"
+#include "ipnsw.hpp"
+#include "HammerBlade.hpp"
+#include "Graph500Data.hpp"
+#include "Graph.hpp"
+#include "IO.hpp"
+#include "IPNSWGraph.hpp"
+#include "IPNSWRunner.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+#include "IProductUBmkFactory.hpp"
+#include "IProductUBmkParallelFactory.hpp"
+#include "BeamSearchKernelRunner.hpp"
+#include "BeamSearchResultReader.hpp"
+#include "BeamSearchFactory.hpp"
+#include "GreedyWalkKernelRunner.hpp"
+#include "GreedyWalkResultReader.hpp"
+#include "GreedyWalkFactory.hpp"
+#include "GreedyWalkResults.hpp"
+#include "StringHelpers.hpp"
+#include <iostream>
+#include <memory>
+
+using namespace ipnsw;
+
+/**
+ * Program entry point: parse the command line, pick the factory that matches
+ * the requested version, and run the workload.
+ *
+ * Versions are matched by prefix ("greedy_walk", "beam_search",
+ * "iproduct_ubmk"); an "iproduct_ubmk" version containing "parallel" selects
+ * the parallel factory. The exact version "debug" only prints the parsed
+ * options.
+ *
+ * @return 0 on success or after the debug dump; 1 if the version is not
+ *         recognized
+ */
+int Main(int argc, char *argv[])
+{
+    Parser args;
+    args.parse(argc, argv);
+
+    std::cout << args.str() << std::endl;
+
+    std::unique_ptr<IPNSWRunner> runner;
+    std::unique_ptr<IPNSWFactory> factory;
+
+    IPNSWRunnerConfig cfg;
+    cfg.grid_x() = args.grid_x();
+    cfg.grid_y() = args.grid_y();
+    cfg.grp_x()  = args.grp_x();
+    cfg.grp_y()  = args.grp_y();
+
+    if (ipnsw::startswith(args.version(), "greedy_walk")) {
+        factory = std::unique_ptr<IPNSWFactory>(new GreedyWalkFactory);
+    } else if (ipnsw::startswith(args.version(), "beam_search")) {
+        factory = std::unique_ptr<IPNSWFactory>(new BeamSearchFactory);
+    } else if (ipnsw::startswith(args.version(), "iproduct_ubmk")) {
+        /* parse the number of inner products */
+        std::cout << "num inner products " << args.num_iproducts() << std::endl;
+        int n_iproducts = args.num_iproducts();
+
+        bool parallel = args.version().find("parallel") != std::string::npos;
+        if (parallel) {
+            factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkParallelFactory(n_iproducts));
+        } else {
+            factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkFactory(n_iproducts)) ;
+        }
+
+    } else if (args.version() == "debug") {
+        /* just for debugging */
+        std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl;
+        // keep the query list on the same line as its label
+        std::cout << "--queries=";
+        auto do_queries = args.do_queries();
+        for (auto q : do_queries) {
+            std::cout << q << " ";
+        }
+        std::cout << std::endl;
+        std::cout << "--group-x=" << args.grp_x() << std::endl;
+        std::cout << "--group-y=" << args.grp_y() << std::endl;
+        return 0;
+    } else {
+        // An unrecognized version used to return success without running
+        // anything; report it and return a non-zero status instead.
+        std::cerr << "unknown version '" << args.version() << "'" << std::endl;
+        return 1;
+    }
+
+    runner = std::unique_ptr<IPNSWRunner>(new IPNSWRunner(args, factory, cfg));
+    runner->run();
+
+    return 0;
+}
+
+declare_program_main("IPNSW", Main);
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
new file mode 100644
index 000000000..9c91c72bd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
@@ -0,0 +1,36 @@
+// Copyright (c) 2019, University of Washington All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// 
+// Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the following disclaimer.
+// 
+// Redistributions in binary form must reproduce the above copyright notice, this
+// list of conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.
+// 
+// Neither the name of the copyright holder nor the names of its contributors may
+// be used to endorse or promote products derived from this software without
+// specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma  once
+#include <cstring>
+#include <cstdlib>
+#include <random>
+#include <limits>
+#include <iostream>
+#include <typeinfo>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_cuda.h>
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp
new file mode 100644
index 000000000..a75d5b1bf
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * IPNSW beam search kernel (ipnsw_beam_search) and an input sanity test (input_test)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+// Graph descriptor read by the kernels in this file.
+// The neighbors of vertex v start at neighbors[offsets[v]]; the last vertex
+// (V-1) has E - offsets[V-1] of them, any other has offsets[v+1] - offsets[v].
+// NOTE(review): the field order presumably has to match what the host writes - confirm
+struct graph {
+    const int *offsets;   // per-vertex start index into neighbors
+    const int *neighbors; // concatenated neighbor lists
+    int V;                // number of vertices
+    int E;                // number of edges (length of neighbors)
+};
+
+// Orders (distance, vertex) pairs by ascending distance; the vertex id is ignored.
+class LT {
+public:
+    // const so the comparator can also be invoked through a const object or reference
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+// Orders (distance, vertex) pairs by descending distance; the vertex id is ignored.
+class GT {
+public:
+    // const so the comparator can also be invoked through a const object or reference
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    /*
+     * Sanity test for the kernel inputs: copies each of the four graph
+     * descriptors out of Gs (in the order G_0, G_1, G_2, G_3) and, when
+     * DEBUG_INPUT_TEST is defined, prints the argument pointers and the
+     * descriptor fields. Always returns 0.
+     */
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        // descriptor indices in the order they are visited
+        int order [] = {G_0, G_1, G_2, G_3};
+        struct graph G;
+        int j = 0;
+        while (j < 4) {
+            memcpy(&G, &Gs[order[j]], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+            ++j;
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    /*
+     * Beam search over graph Gs[G_0], starting from the vertex produced by
+     * the greedy walk.
+     *
+     * Gs             - array of graph descriptors; only Gs[G_0] is used
+     * database       - database vectors, VSIZE floats per vertex
+     * query          - query vector, VSIZE floats
+     * seen_mem       - backing memory for the set of visited vertices
+     * v_curr_o       - in: start vertex
+     * d_curr_o       - in: distance of the start vertex to the query
+     * candidates_mem - backing memory for the candidates heap
+     * results_mem    - backing memory for the results heap; on return its
+     *                  first *n_results entries are the best results in
+     *                  ascending distance order
+     * n_results      - out: number of results, at most N_RESULTS
+     *
+     * Always returns 0.
+     */
+    int ipnsw_beam_search(const graph *Gs, const float *database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DynSet<int, std::less<int>> seen(seen_mem, N_V);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+        //bsg_print_int(v_curr);
+        //bsg_print_float(d_curr);
+
+        // initialize priority queues
+        // candidates: smallest distance on top; results: largest distance on top
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+            //v_worst = std::get<1>(results.top());
+            bsg_print_int(-v_best);
+
+            // the best remaining candidate is worse than the worst result: done
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best
+            int dst_0 = G.offsets[v_best];
+            // The last vertex has no offsets[v+1] entry, so its degree comes
+            // from E. The test must be on v_best, the vertex being expanded,
+            // not on the start vertex v_curr.
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+                bsg_print_int(dst);
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        // Select the n_res closest results from ALL stored results. Sorting
+        // only the first n_res slots would rank whatever sits at the front of
+        // the heap array, which holds the worst entries.
+        // NOTE(review): like the original sort, this assumes DynHeap keeps its
+        // elements in results_mem[0 .. size) - confirm in heap.hpp
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
new file mode 100644
index 000000000..9ee2ce5e7
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * IPNSW beam-search kernel (single-tile build, distance = negated inner_product).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000     // not referenced in this file; presumably the database vertex count -- TODO confirm
+#define VSIZE 100        // floats per vector: sizes q[] and strides the database (database[dst*VSIZE])
+#define NG 4             // not referenced in this file; presumably the number of graphs in Gs -- TODO confirm
+#define V_ENTRY 82026    // not referenced in this file; presumably the search entry vertex -- TODO confirm
+
+#define EF        128    // cap on the size of the results heap during beam search
+#define N_RESULTS 10     // upper bound on the count written to *n_results
+
+#define G_0 3            // G_0..G_3: indices into the Gs array; input_test visits them in this order
+#define G_1 2            // and ipnsw_beam_search searches Gs[G_0]
+#define G_2 1
+#define G_3 0
+
+struct graph {                // one graph in CSR-style form, as read by ipnsw_beam_search
+    const int *offsets;       // offsets[v]: index in neighbors[] of v's first neighbor
+    const int *neighbors;     // concatenated neighbor (destination vertex) lists
+    int V;                    // vertex count; vertex V-1 is treated as the last row
+    int E;                    // total length of neighbors[]; closes the last vertex's list
+};
+
+// Strict less-than on the first (float) member of the pair; the second (int) member is ignored.
+struct LT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+// Strict greater-than on the first (float) member of the pair; the second (int) member is ignored.
+struct GT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Input smoke test: copies each of the four graph descriptors out of Gs and, when
+        // DEBUG_INPUT_TEST is defined, prints the argument pointers and descriptor fields.
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen  = %08x\n", seen);
+#endif // DEBUG_INPUT_TEST
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot != 4; slot++) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0; // unconditional success
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+#define distance(v0, v1) /* negated inner product, so a smaller value is a better match */ \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    int ipnsw_beam_search(const graph *Gs, const float *database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // Beam search over graph Gs[G_0], seeded with the vertex/distance the caller left in
+        // *v_curr_o / *d_curr_o. Sorts results_mem by ascending distance before returning and
+        // stores min(result count, N_RESULTS) in *n_results. Always returns 0.
+        DenseSet<int>seen(seen_mem); // vertices already scored (backed by seen_mem)
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops its best (d_best) entry; results.top() is read as the worst kept one.
+        // NOTE(review): capacities 512/128 are assumed to match the caller's buffers -- confirm.
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the best open candidate is worse than the worst kept result
+            if (d_best > d_worst) {
+                break;
+            }
+            // traverse neighbors of v_best. The last-row test must look at v_best (the vertex
+            // being expanded), not the entry vertex v_curr, so offsets[v_best+1] is never read for V-1.
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp
new file mode 100644
index 000000000..d55e7e900
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp
@@ -0,0 +1,279 @@
+/*
+ * IPNSW beam-search kernel for an 8x4 tile group: tile 0 searches, the other tiles compute distances.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 8
+#define BSG_TILE_GROUP_Y_DIM 4
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000     // not referenced in this file; presumably the database vertex count -- TODO confirm
+#define VSIZE 100        // floats per vector: sizes q[] and strides the database (database[dst * VSIZE])
+#define NG 4             // not referenced in this file; presumably the number of graphs in Gs -- TODO confirm
+#define V_ENTRY 82026    // not referenced in this file; presumably the search entry vertex -- TODO confirm
+
+#define EF        128    // cap on the size of the results heap during beam search
+#define N_RESULTS 10     // upper bound on the count written to *n_results
+
+#define G_0 3            // G_0..G_3: indices into the Gs array; input_test visits them in this order
+#define G_1 2            // and ipnsw_beam_search searches Gs[G_0]
+#define G_2 1
+#define G_3 0
+
+struct graph {                // one graph in CSR-style form, as read by ipnsw_beam_search
+    const int *offsets;       // offsets[v]: index in neighbors[] of v's first neighbor
+    const int *neighbors;     // concatenated neighbor (destination vertex) lists
+    int V;                    // vertex count; vertex V-1 is treated as the last row
+    int E;                    // total length of neighbors[]; closes the last vertex's list
+};
+
+// Strict less-than on the first (float) member of the pair; the second (int) member is ignored.
+struct LT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+// Strict greater-than on the first (float) member of the pair; the second (int) member is ignored.
+struct GT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Input smoke test: copies each of the four graph descriptors out of Gs and, when
+        // DEBUG_INPUT_TEST is defined, prints the argument pointers and descriptor fields.
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen  = %08x\n", seen);
+#endif // DEBUG_INPUT_TEST
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot != 4; slot++) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0; // unconditional success
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1) /* negated inner product, so a smaller value is a better match */ \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+
+    static constexpr int SYNC_INV  = -1; // initial mailbox value; also the "invalid" value given to sleep_until_valid
+    static constexpr int SYNC_DONE = -2; // mailbox value that makes ipnsw_distance_slave leave its loop
+
+    void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database, // vectors, VSIZE floats apart
+                              const float *query,                                // query vector handed to distance()
+                              int   *dst_p,                                      // mailbox read for the next vertex id
+                              float *distance_p,                                 // base of the per-tile result array
+                              int   *done_p,                                     // base of the per-tile done-flag array
+                              DenseSet_v1<int> *seen)                            // set of already-scored vertices
+    {   // Worker loop: score every vertex id posted to *dst_p until SYNC_DONE arrives.
+        float *result = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_id]); // slot [__bsg_id] via tile (0, 0)
+        int   *done   = bsg_tile_group_remote_pointer<int>(  0, 0, &done_p[__bsg_id]);     // flag [__bsg_id] via tile (0, 0)
+        while (true) {
+            int dst = sleep_until_valid(dst_p, SYNC_INV); // presumably blocks while *dst_p == SYNC_INV -- confirm in helper
+            if (dst == SYNC_DONE)
+                break;
+            // unseen vertex: mark it and report its distance; otherwise report the -INFINITY sentinel
+            if (!seen->in(dst)) {
+                seen->atomic_insert(dst);
+                // distance() here is the negated inner product macro defined above
+                float tmp = distance(query, &database[dst * VSIZE]);
+                // publish the value first; the done flag is raised after it
+                *result = tmp;
+            } else {
+                *result = -INFINITY;
+            }
+            *done = 1;
+        }
+    }
+
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // Tile 0 runs the beam search over Gs[G_0] (seeded from *v_curr_o / *d_curr_o); every other
+        // tile runs ipnsw_distance_slave and scores the neighbors tile 0 posts. Always returns 0.
+        DenseSet_v1<int>seen(seen_mem); // vertices already scored (backed by seen_mem)
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        // per-tile mailbox plus the result/done arrays that workers fill in on tile (0, 0).
+        // NOTE(review): relies on these locals sitting at the same address on every tile -- confirm.
+        int   dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+        int   dist_done  [BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+        if (__bsg_id != 0) {
+            ipnsw_distance_slave(database, q, &dst_slave, dist_result, dist_done, &seen);
+        } else {
+            bsg_saif_start();
+            int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+            for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x)
+                for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) {
+                    dst_slave_ptr[bsg_x_y_to_id(x,y)]
+                        = bsg_tile_group_remote_pointer(x, y, &dst_slave);
+                    dist_result[bsg_x_y_to_id(x,y)] = INFINITY;
+                    dist_done[bsg_x_y_to_id(x,y)]   = 0;
+                }
+
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif
+
+            // candidates pops its best (d_best) entry; results.top() is read as the worst kept one
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+                // stop once the best open candidate is worse than the worst kept result
+                if (d_best > d_worst) {
+                    break;
+                }
+                // traverse neighbors of v_best. The last-row test must look at v_best (the vertex
+                // being expanded), not the entry vertex v_curr, so offsets[v_best+1] is never read for V-1.
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                // hand the neighbors out one tile-group-sized batch at a time
+                for (int dst_i = 0;
+                     dst_i < degree;
+                     dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) {
+                     // read-in work
+                    int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i);
+                    int dst_v[dst_n];
+                    memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                    // delegate work: neighbor j of the batch goes to the tile with id j
+                    int dst;
+                    for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+                        *dst_slave_ptr[dst_j] = dst;
+                    }
+                    // work myself (tile 0 scores neighbor 0 of the batch)
+                    {
+                        dst = dst_v[0];
+                        if (!seen.in(dst)) {
+                            seen.atomic_insert(dst);
+                            dist_result[0] = distance(q, &database[dst * VSIZE]);
+                        } else {
+                            dist_result[0] = -INFINITY;
+                        }
+                        dist_done[0] = 1;
+                    }
+                    // reduce: wait on each slot's done flag, then fold its distance into the heaps
+                    for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_int(dst);
+#endif
+                        bsg_wait_local_int_asm_blind(&dist_done[dst_j], 1);
+                        dist_done[dst_j] = 0;
+                        float d_neib = dist_result[dst_j];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_float(d_neib);
+#endif
+                        // -INFINITY is the sentinel for a vertex that had already been seen
+                        if (d_neib == -INFINITY)
+                            continue;
+
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+
+                }
+
+            }
+            // Post SYNC_DONE so every worker leaves ipnsw_distance_slave; each dispatched batch was reduced above, so none is mid-computation.
+            for (int id = 1; id < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++id) *dst_slave_ptr[id] = SYNC_DONE;
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+            bsg_saif_end();
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp
new file mode 100644
index 000000000..b0f374a4c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam-search kernel (single-tile build, distance = negated inner_product_v1).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000     // not referenced in this file; presumably the database vertex count -- TODO confirm
+#define VSIZE 100        // floats per vector: sizes q[] and strides the database (database[dst*VSIZE])
+#define NG 4             // not referenced in this file; presumably the number of graphs in Gs -- TODO confirm
+#define V_ENTRY 82026    // not referenced in this file; presumably the search entry vertex -- TODO confirm
+
+#define EF        128    // cap on the size of the results heap during beam search
+#define N_RESULTS 10     // upper bound on the count written to *n_results
+
+#define G_0 3            // G_0..G_3: indices into the Gs array; input_test visits them in this order
+#define G_1 2            // and ipnsw_beam_search searches Gs[G_0]
+#define G_2 1
+#define G_3 0
+
+struct graph {                // one graph in CSR-style form, as read by ipnsw_beam_search
+    const int *offsets;       // offsets[v]: index in neighbors[] of v's first neighbor
+    const int *neighbors;     // concatenated neighbor (destination vertex) lists
+    int V;                    // vertex count; vertex V-1 is treated as the last row
+    int E;                    // total length of neighbors[]; closes the last vertex's list
+};
+
+// Strict less-than on the first (float) member of the pair; the second (int) member is ignored.
+struct LT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+// Strict greater-than on the first (float) member of the pair; the second (int) member is ignored.
+struct GT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Input smoke test: copies each of the four graph descriptors out of Gs and, when
+        // DEBUG_INPUT_TEST is defined, prints the argument pointers and descriptor fields.
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen  = %08x\n", seen);
+#endif // DEBUG_INPUT_TEST
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot != 4; slot++) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0; // unconditional success
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+#define distance(v0, v1) /* negated inner product, so a smaller value is a better match */ \
+    (-1 * inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // Beam search over graph Gs[G_0], seeded with the vertex/distance the caller left in
+        // *v_curr_o / *d_curr_o. Sorts results_mem by ascending distance before returning and
+        // stores min(result count, N_RESULTS) in *n_results. Always returns 0.
+        DenseSet<int>seen(seen_mem); // vertices already scored (backed by seen_mem)
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops its best (d_best) entry; results.top() is read as the worst kept one.
+        // NOTE(review): capacities 512/128 are assumed to match the caller's buffers -- confirm.
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the best open candidate is worse than the worst kept result
+            if (d_best > d_worst) {
+                break;
+            }
+            // traverse neighbors of v_best. The last-row test must look at v_best (the vertex
+            // being expanded), not the entry vertex v_curr, so offsets[v_best+1] is never read for V-1.
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+        // sort every kept result: the heap array is presumably only heap-ordered, so its first n_res slots are not the n_res best
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp
new file mode 100644
index 000000000..f98216636
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam-search kernel (single-tile build, distance = negated inner_product_v2).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000     // not referenced in this file; presumably the database vertex count -- TODO confirm
+#define VSIZE 100        // floats per vector: sizes q[] and strides the database (database[dst*VSIZE])
+#define NG 4             // not referenced in this file; presumably the number of graphs in Gs -- TODO confirm
+#define V_ENTRY 82026    // not referenced in this file; presumably the search entry vertex -- TODO confirm
+
+#define EF        128    // cap on the size of the results heap during beam search
+#define N_RESULTS 10     // upper bound on the count written to *n_results
+
+#define G_0 3            // G_0..G_3: indices into the Gs array; input_test visits them in this order
+#define G_1 2            // and ipnsw_beam_search searches Gs[G_0]
+#define G_2 1
+#define G_3 0
+
+struct graph {                // one graph in CSR-style form, as read by ipnsw_beam_search
+    const int *offsets;       // offsets[v]: index in neighbors[] of v's first neighbor
+    const int *neighbors;     // concatenated neighbor (destination vertex) lists
+    int V;                    // vertex count; vertex V-1 is treated as the last row
+    int E;                    // total length of neighbors[]; closes the last vertex's list
+};
+
+// Strict less-than on the first (float) member of the pair; the second (int) member is ignored.
+struct LT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+// Strict greater-than on the first (float) member of the pair; the second (int) member is ignored.
+struct GT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Input smoke test: copies each of the four graph descriptors out of Gs and, when
+        // DEBUG_INPUT_TEST is defined, prints the argument pointers and descriptor fields.
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen  = %08x\n", seen);
+#endif // DEBUG_INPUT_TEST
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot != 4; slot++) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0; // unconditional success
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+#define distance(v0, v1) /* negated inner product, so a smaller value is a better match */ \
+    (-1 * inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // Beam search over graph Gs[G_0], seeded with the vertex/distance the caller left in
+        // *v_curr_o / *d_curr_o. Sorts results_mem by ascending distance before returning and
+        // stores min(result count, N_RESULTS) in *n_results. Always returns 0.
+        DenseSet<int>seen(seen_mem); // vertices already scored (backed by seen_mem)
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops its best (d_best) entry; results.top() is read as the worst kept one.
+        // NOTE(review): capacities 512/128 are assumed to match the caller's buffers -- confirm.
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the best open candidate is worse than the worst kept result
+            if (d_best > d_worst) {
+                break;
+            }
+            // traverse neighbors of v_best. The last-row test must look at v_best (the vertex
+            // being expanded), not the entry vertex v_curr, so offsets[v_best+1] is never read for V-1.
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+        // sort every kept result: the heap array is presumably only heap-ordered, so its first n_res slots are not the n_res best
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp
new file mode 100644
index 000000000..01f62555f
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam-search kernel (single-tile build, distance = negated inner_product_v3).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000     // not referenced in this file; presumably the database vertex count -- TODO confirm
+#define VSIZE 100        // floats per vector: sizes q[] and strides the database (database[dst*VSIZE])
+#define NG 4             // not referenced in this file; presumably the number of graphs in Gs -- TODO confirm
+#define V_ENTRY 82026    // not referenced in this file; presumably the search entry vertex -- TODO confirm
+
+#define EF        128    // cap on the size of the results heap during beam search
+#define N_RESULTS 10     // upper bound on the count written to *n_results
+
+#define G_0 3            // G_0..G_3: indices into the Gs array; input_test visits them in this order
+#define G_1 2            // and ipnsw_beam_search searches Gs[G_0]
+#define G_2 1
+#define G_3 0
+
+struct graph {                // one graph in CSR-style form, as read by ipnsw_beam_search
+    const int *offsets;       // offsets[v]: index in neighbors[] of v's first neighbor
+    const int *neighbors;     // concatenated neighbor (destination vertex) lists
+    int V;                    // vertex count; vertex V-1 is treated as the last row
+    int E;                    // total length of neighbors[]; closes the last vertex's list
+};
+
+// Strict less-than on the first (float) member of the pair; the second (int) member is ignored.
+struct LT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+// Strict greater-than on the first (float) member of the pair; the second (int) member is ignored.
+struct GT {
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const call operator: usable through const objects/references
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Input smoke test: copies each of the four graph descriptors out of Gs and, when
+        // DEBUG_INPUT_TEST is defined, prints the argument pointers and descriptor fields.
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n", Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n", query);
+        bsg_printf("seen  = %08x\n", seen);
+#endif // DEBUG_INPUT_TEST
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot != 4; slot++) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0; // unconditional success
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+#define distance(v0, v1) /* negated inner product, so a smaller value is a better match */ \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // Beam search over graph Gs[G_0], seeded with the vertex/distance the caller left in
+        // *v_curr_o / *d_curr_o. Sorts results_mem by ascending distance before returning and
+        // stores min(result count, N_RESULTS) in *n_results. Always returns 0.
+        DenseSet<int>seen(seen_mem); // vertices already scored (backed by seen_mem)
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops its best (d_best) entry; results.top() is read as the worst kept one.
+        // NOTE(review): capacities 512/128 are assumed to match the caller's buffers -- confirm.
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the best open candidate is worse than the worst kept result
+            if (d_best > d_worst) {
+                break;
+            }
+            // traverse neighbors of v_best. The last-row test must look at v_best (the vertex
+            // being expanded), not the entry vertex v_curr, so offsets[v_best+1] is never read for V-1.
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+        // sort every kept result: the heap array is presumably only heap-ordered, so its first n_res slots are not the n_res best
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
new file mode 100644
index 000000000..1ced33c51
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * IPNSW beam search kernel (beam_search_v5-ipv4serial)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph as passed to the kernels (element of the Gs array)
+    const int *offsets;   // offsets[v]: index in `neighbors` where vertex v's neighbor list begins
+    const int *neighbors; // neighbor vertex ids, read as neighbors[offsets[v] + i]
+    int V;                // the search compares vertex ids against V-1 to detect the last vertex
+    int E;                // used as the end index of the last vertex's neighbor list
+};
+
+class LT {  // "less than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+class GT {  // "greater than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Copies the four graph headers Gs[G_0..G_3] into a local, one at a time; with DEBUG_INPUT_TEST defined it also prints the arguments and each header. Returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+
+        // order in which the graphs are visited
+        const int order[] = {G_0, G_1, G_2, G_3};
+        for (int level = 0; level < 4; ++level) {
+            struct graph header;
+            memcpy(&header, &Gs[order[level]], sizeof(header));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", level, header.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", level, header.neighbors);
+            bsg_printf("G[%d].V = %d\n", level, header.V);
+            bsg_printf("G[%d].E = %d\n", level, header.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+
+    // Beam search over Gs[G_0], seeded with the (vertex, distance) pair read from *v_curr_o / *d_curr_o.
+    // The candidate and result heaps are built in candidates_mem / results_mem; seen_mem backs the visited set.
+    // On exit results_mem[0, results.size()) is sorted with LT and *n_results = min(results.size(), N_RESULTS); returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best
+            int dst_0 = G.offsets[v_best];
+            // the vertex being expanded is v_best: for vertex V-1 the list ends at G.E instead of offsets[v+1]
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
new file mode 100644
index 000000000..7073bb548
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * IPNSW beam search kernel (beam_search_v5)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph as passed to the kernels (element of the Gs array)
+    const int *offsets;   // offsets[v]: index in `neighbors` where vertex v's neighbor list begins
+    const int *neighbors; // neighbor vertex ids, read as neighbors[offsets[v] + i]
+    int V;                // the search compares vertex ids against V-1 to detect the last vertex
+    int E;                // used as the end index of the last vertex's neighbor list
+};
+
+class LT {  // "less than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+class GT {  // "greater than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Copies the four graph headers Gs[G_0..G_3] into a local, one at a time; with DEBUG_INPUT_TEST defined it also prints the arguments and each header. Returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+
+        // order in which the graphs are visited
+        const int order[] = {G_0, G_1, G_2, G_3};
+        for (int level = 0; level < 4; ++level) {
+            struct graph header;
+            memcpy(&header, &Gs[order[level]], sizeof(header));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", level, header.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", level, header.neighbors);
+            bsg_printf("G[%d].V = %d\n", level, header.V);
+            bsg_printf("G[%d].E = %d\n", level, header.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over Gs[G_0], seeded with the (vertex, distance) pair read from *v_curr_o / *d_curr_o.
+    // The candidate and result heaps are built in candidates_mem / results_mem; seen_mem backs the visited set.
+    // On exit results_mem[0, results.size()) is sorted with LT and *n_results = min(results.size(), N_RESULTS); returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best
+            int dst_0 = G.offsets[v_best];
+            // the vertex being expanded is v_best: for vertex V-1 the list ends at G.E instead of offsets[v+1]
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
new file mode 100644
index 000000000..e88095dea
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
@@ -0,0 +1,195 @@
+/*
+ * IPNSW beam search kernel (beam_search_v6)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+struct graph {            // one graph as passed to the kernels (element of the Gs array)
+    const int *offsets;   // offsets[v]: index in `neighbors` where vertex v's neighbor list begins
+    const int *neighbors; // neighbor vertex ids, read as neighbors[offsets[v] + i]
+    int V;                // the search compares vertex ids against V-1 to detect the last vertex
+    int E;                // used as the end index of the last vertex's neighbor list
+};
+
+class LT {  // "less than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+class GT {  // "greater than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Copies the four graph headers Gs[G_0..G_3] into a local, one at a time; with DEBUG_INPUT_TEST defined it also prints the arguments and each header. Returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+
+        // order in which the graphs are visited
+        const int order[] = {G_0, G_1, G_2, G_3};
+        for (int level = 0; level < 4; ++level) {
+            struct graph header;
+            memcpy(&header, &Gs[order[level]], sizeof(header));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", level, header.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", level, header.neighbors);
+            bsg_printf("G[%d].V = %d\n", level, header.V);
+            bsg_printf("G[%d].E = %d\n", level, header.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+    // Beam search over Gs[G_0]; every tile constructs `ip` and calls ip.init(), then only the tile with __bsg_id == 0 searches.
+    // The search is seeded from *v_curr_o / *d_curr_o and scores neighbors with -1 * ip.inner_product(dst).
+    // Tile 0 leaves results_mem[0, results.size()) sorted with LT and sets *n_results = min(results.size(), N_RESULTS); returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        if (__bsg_id == 0) {
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+            //bsg_print_int(v_curr);
+            //bsg_print_float(d_curr);
+
+            // initialize priority queues
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // traverse neighbors of v_best
+                int dst_0 = G.offsets[v_best];
+                // the vertex being expanded is v_best: for vertex V-1 the list ends at G.E instead of offsets[v+1]
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(dst);
+#endif
+                    if (!seen.in(dst)) {
+                        // mark as seen
+                        seen.insert(dst);
+                        float d_neib = -1 * ip.inner_product(dst);
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+                }
+            }
+
+            //ip.exit();
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
new file mode 100644
index 000000000..37d995573
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
@@ -0,0 +1,194 @@
+/*
+ * IPNSW beam search kernel (beam_search_v7)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+struct graph {            // one graph as passed to the kernels (element of the Gs array)
+    const int *offsets;   // offsets[v]: index in `neighbors` where vertex v's neighbor list begins
+    const int *neighbors; // neighbor vertex ids, read as neighbors[offsets[v] + i]
+    int V;                // the search compares vertex ids against V-1 to detect the last vertex
+    int E;                // used as the end index of the last vertex's neighbor list
+};
+
+class LT {  // "less than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+class GT {  // "greater than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Copies the four graph headers Gs[G_0..G_3] into a local, one at a time; with DEBUG_INPUT_TEST defined it also prints the arguments and each header. Returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+
+        // order in which the graphs are visited
+        const int order[] = {G_0, G_1, G_2, G_3};
+        for (int level = 0; level < 4; ++level) {
+            struct graph header;
+            memcpy(&header, &Gs[order[level]], sizeof(header));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", level, header.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", level, header.neighbors);
+            bsg_printf("G[%d].V = %d\n", level, header.V);
+            bsg_printf("G[%d].E = %d\n", level, header.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+    // Beam search over Gs[G_0], seeded with the (vertex, distance) pair read from *v_curr_o / *d_curr_o.
+    // Neighbor distances are -1 * ip.inner_product(dst), with `ip` built over (database, q) and started by ip.init().
+    // On exit results_mem[0, results.size()) is sorted with LT and *n_results = min(results.size(), N_RESULTS); returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+        //bsg_print_int(v_curr);
+        //bsg_print_float(d_curr);
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best
+            int dst_0 = G.offsets[v_best];
+            // the vertex being expanded is v_best: for vertex V-1 the list ends at G.E instead of offsets[v+1]
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = -1 * ip.inner_product(dst);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+        }
+
+        //ip.exit();
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
new file mode 100644
index 000000000..d87eaf3bd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
@@ -0,0 +1,270 @@
+/*
+ * IPNSW beam search kernel (beam_search_v8)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph as passed to the kernels (element of the Gs array)
+    const int *offsets;   // offsets[v]: index in `neighbors` where vertex v's neighbor list begins
+    const int *neighbors; // neighbor vertex ids, read as neighbors[offsets[v] + i]
+    int V;                // the search compares vertex ids against V-1 to detect the last vertex
+    int E;                // used as the end index of the last vertex's neighbor list
+};
+
+class LT {  // "less than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+class GT {  // "greater than" on the float (first) member of a pair; the int member is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first;  // const call operator: usable through const comparator objects
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Copies the four graph headers Gs[G_0..G_3] into a local, one at a time; with DEBUG_INPUT_TEST defined it also prints the arguments and each header. Returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+
+        // order in which the graphs are visited
+        const int order[] = {G_0, G_1, G_2, G_3};
+        for (int level = 0; level < 4; ++level) {
+            struct graph header;
+            memcpy(&header, &Gs[order[level]], sizeof(header));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", level, header.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", level, header.neighbors);
+            bsg_printf("G[%d].V = %d\n", level, header.V);
+            bsg_printf("G[%d].E = %d\n", level, header.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+
+    static constexpr int SYNC_INV  = -1;
+    static constexpr int SYNC_DONE = -2;
+
+    // Worker loop: takes vertex ids from sleep_until_valid(dst_p, SYNC_INV) and answers each one with either
+    // distance(query, vertex row) or -INFINITY (vertex already in `seen`); returns when it receives SYNC_DONE.
+    void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database,
+                              const float *query,
+                              int   *dst_p,
+                              float *distance_p,
+                              DenseSet_v1<int> *seen)
+    {
+        // this tile's slot of distance_p, addressed through a tile-group remote pointer to (0, 0)
+        float *out = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_id]);
+        for (;;) {
+            const int vertex = sleep_until_valid(dst_p, SYNC_INV);
+            if (vertex == SYNC_DONE)
+                return;
+            if (seen->in(vertex)) {
+                // already visited: answer with the sentinel
+                *out = -INFINITY;
+                continue;
+            }
+            seen->atomic_insert(vertex);
+            *out = distance(query, &database[vertex * VSIZE]);
+        }
+    }
+
+    // Tile 0 runs the beam search over Gs[G_0]; every other tile runs ipnsw_distance_slave and is handed
+    // neighbor ids through its dst_slave word, answering into tile 0's dist_result[] slot for that tile.
+    // Tile 0 leaves results_mem[0, results.size()) sorted with LT and sets *n_results = min(results.size(), N_RESULTS); returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int   dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+
+        if (__bsg_id != 0) {
+            ipnsw_distance_slave(database, q, &dst_slave, dist_result, &seen);
+        } else {
+
+            int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+            for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x)
+                for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) {
+                    dst_slave_ptr[bsg_x_y_to_id(x,y)]
+                        = bsg_tile_group_remote_pointer(x, y, &dst_slave);
+                    dist_result[bsg_x_y_to_id(x,y)] = INFINITY;
+                }
+
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif
+
+            // initialize priority queues
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // traverse neighbors of v_best
+                int dst_0 = G.offsets[v_best];
+                // the vertex being expanded is v_best: for vertex V-1 the list ends at G.E instead of offsets[v+1]
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+                // traverse neighbors in batches of up to BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM
+                for (int dst_i = 0;
+                     dst_i < degree;
+                     dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) {
+                     // read-in work
+                    int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i);
+                    int dst_v[dst_n];
+                    memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                    // delegate work
+                    int dst;
+                    for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+                        *dst_slave_ptr[dst_j] = dst;
+                    }
+                    // work myself
+                    {
+                        dst = dst_v[0];
+                        if (!seen.in(dst)) {
+                            seen.atomic_insert(dst);
+                            dist_result[0] = distance(q, &database[dst * VSIZE]);
+                        } else {
+                            dist_result[0] = -INFINITY;
+                        }
+                    }
+                    // reduce
+                    for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_int(dst);
+#endif
+                        float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_float(d_neib);
+#endif
+                        // already seen?
+                        if (d_neib == -INFINITY)
+                            continue;
+
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+                }
+
+            }
+
+            // NOTE(review): nothing in this function writes SYNC_DONE to the other tiles' dst_slave — confirm how they leave ipnsw_distance_slave
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
new file mode 100644
index 000000000..69def7bdd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
@@ -0,0 +1,249 @@
+/*
+ * IPNSW beam-search kernel (v9): ipnsw_beam_search and its per-column worker ipnsw_x_master.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 4
+#define BSG_TILE_GROUP_Y_DIM 4
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs); // const-qualified so a const comparator object can be called
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs); // const-qualified so a const comparator object can be called
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Debug switches: comment out a define below to disable that trace, uncomment it to enable it
+#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+    using InnerProduct = InnerProductParallel_Y<BSG_TILE_GROUP_Y_DIM>;
+
+    static constexpr int SYNC_INV  = -1; // initial value of a dst_slave slot; second argument of sleep_until_valid on that slot
+    static constexpr int SYNC_DONE = -2; // written to every dst_slave by tile (0,0) when the search ends; ipnsw_x_master returns on it
+
+    // Worker loop run by the row-0 tiles with __bsg_x != 0. Waits on *dst_p for a vertex id and
+    // writes that vertex's negated inner product, or -INFINITY if it was already in `seen`, to
+    // slot __bsg_x of distance_p as addressed through tile (0,0). database and query are unused.
+    void ipnsw_x_master(bsg_attr_remote const float *__restrict database, const float *query,
+                        int *dst_p, float *distance_p,
+                        DenseSet_v1<int> *seen, InnerProduct *ip_y)
+    {
+        float *result = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_x]);
+        while (true) {
+            int dst = sleep_until_valid(dst_p, SYNC_INV);
+            if (dst == SYNC_DONE)
+                break;
+
+            if (!seen->in(dst)) {
+                seen->atomic_insert(dst);
+                // negate in single precision: a -1.0 (double) factor would promote the product to double
+                *result = -1.0f * ip_y->inner_product(dst);
+            } else {
+                *result = -INFINITY; // the reducer in ipnsw_beam_search skips this value as "already seen"
+            }
+        }
+    }
+
+    // Beam search over the graph Gs[G_0], seeded with the vertex/distance pair read from
+    // *v_curr_o / *d_curr_o. Tile (0,0) drives the search and posts neighbors to the other
+    // row-0 tiles, which run ipnsw_x_master. On tile (0,0), results_mem ends up sorted by
+    // ascending distance and *n_results receives min(results.size(), N_RESULTS). Returns 0.
+    int ipnsw_beam_search(const graph *Gs, bsg_attr_remote const float *__restrict database,
+                          const float *query, int *seen_mem, int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem, std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        InnerProduct ip_y(database, q);
+        ip_y.init();
+
+        int   dst_slave = SYNC_INV;              // work slot: tile (0,0) posts a vertex id or SYNC_DONE here
+        float dist_result[BSG_TILE_GROUP_X_DIM]; // distance slots read by tile (0,0), one per column
+
+        if (__bsg_y == 0) {
+            if (__bsg_x == 0) {
+                // collect a pointer to each row-0 tile's dst_slave and mark every distance slot as unwritten
+                int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM];
+                for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) {
+                    dst_slave_ptr[x] = bsg_tile_group_remote_pointer(x, 0, &dst_slave);
+                    dist_result[x] = INFINITY;
+                }
+
+                // retrieve results from greedy walk
+                int v_curr   = *v_curr_o;
+                float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+                bsg_print_int(v_curr);
+                bsg_print_float(d_curr);
+#endif
+
+                // initialize priority queues: candidates.pop() yields the smallest distance, results.top() the largest
+                DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+                DynHeap<std::pair<float, int>, LT> results(results_mem, 128); // NOTE(review): results briefly holds EF+1 entries (push before pop) - confirm 128 allows that
+
+                candidates.push({d_curr, v_curr});
+                results.push({d_curr, v_curr});
+
+                float d_worst = d_curr;
+                seen.insert(v_curr);
+
+                while (!candidates.empty()) {
+                    int   v_best;
+                    float d_best;
+
+                    auto best = candidates.pop();
+                    v_best = std::get<1>(best);
+                    d_best = std::get<0>(best);
+
+                    d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(-v_best);
+#endif
+
+                    if (d_best > d_worst) {
+                        break;
+                    }
+
+                    // neighbor range of v_best; the last vertex has no offsets[v+1], so its list ends at G.E
+                    int dst_0 = G.offsets[v_best];
+                    int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                    // traverse neighbors, one batch of up to BSG_TILE_GROUP_X_DIM at a time
+                    for (int dst_i = 0;
+                         dst_i < degree;
+                         dst_i += BSG_TILE_GROUP_X_DIM) {
+                        // read-in work
+                        int dst_n = std::min(BSG_TILE_GROUP_X_DIM, degree-dst_i);
+                        int dst_v[dst_n];
+                        memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                        // delegate work: neighbor dst_j of the batch goes to the tile in column dst_j
+                        int dst;
+                        for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+                            *dst_slave_ptr[dst_j] = dst;
+                        }
+                        // work myself
+                        {
+                            dst = dst_v[0];
+                            if (!seen.in(dst)) {
+                                seen.atomic_insert(dst);
+                                dist_result[0] = -1.0f * ip_y.inner_product(dst); // float factor: no promotion to double
+                            } else {
+                                dist_result[0] = -INFINITY;
+                            }
+                        }
+                        // reduce
+                        for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+                            // NOTE(review): nothing here resets dist_result[dst_j] to INFINITY after the read - presumably sleep_until_valid does; confirm
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_int(dst);
+#endif
+                            float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_float(d_neib);
+#endif
+                            // already seen?
+                            if (d_neib == -INFINITY)
+                                continue;
+
+                            d_worst = std::get<0>(results.top());
+                            // if there's room for new result or this distance is promising
+                            if ((results.size() < EF) || (d_neib < d_worst)) {
+
+                                // push onto candidates and results
+                                candidates.push({d_neib, dst});
+                                results.push({d_neib, dst});
+
+                                // prune down to recall
+                                if (results.size() > EF)
+                                    results.pop();
+                            }
+                        }
+
+                    }
+
+                }
+
+                // signal all columns done
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM; ++tile)
+                    *dst_slave_ptr[tile] = SYNC_DONE;
+
+                int n_res = std::min(results.size(), N_RESULTS);
+                std::sort(results_mem, results_mem+results.size(), LT());
+                *n_results = n_res;
+
+            } else { // bsg_x != 0
+                ipnsw_x_master(database, q, &dst_slave, dist_result, &seen, &ip_y);
+            }
+        }
+
+        ip_y.exit();
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
new file mode 100644
index 000000000..9c761e94d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
@@ -0,0 +1,2 @@
+// No-op kernel for debugging. Returns 0 explicitly: flowing off the end of a non-void function is undefined behavior.
+extern "C" int empty() { return 0; }
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp
new file mode 100644
index 000000000..385e69d8a
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel: input_test and ipnsw_greedy_search (distance via inner_product).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Argument smoke test: copies the four graph descriptors out of Gs onto the stack.
+        // Prints only when DEBUG_INPUT_TEST is defined; always returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int order[] = {G_0, G_1, G_2, G_3}; // Gs indices in the order they are visited
+        struct graph local;
+        for (int pos = 0; pos < 4; ++pos) {
+            memcpy(&local, &Gs[order[pos]], sizeof(local));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", pos, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", pos, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", pos, local.V);
+            bsg_printf("G[%d].E = %d\n", pos, local.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    // Greedy walk over graphs Gs[0] .. Gs[NG-2], starting at vertex V_ENTRY. In each graph it keeps
+    // scanning the current vertex's neighbors and moving to the one with the smallest distance to the
+    // query until a scan finds no improvement. Writes the final vertex and distance to *v_curr_o / *d_curr_o; returns 0. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs, const float *database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q)); // local copy of the query vector
+
+        int   v_curr = V_ENTRY;
+        float d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) {
+            struct graph G = Gs[i];
+            bool changed = true;
+            while (changed) {
+                changed = false;
+                // fetch neighbors; the last vertex has no offsets[v+1], so its list ends at G.E
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) {
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp
new file mode 100644
index 000000000..67533d6da
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (v1): input_test and ipnsw_greedy_search (distance via inner_product_v1).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Argument smoke test: copies the four graph descriptors out of Gs onto the stack.
+        // Prints only when DEBUG_INPUT_TEST is defined; always returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int order[] = {G_0, G_1, G_2, G_3}; // Gs indices in the order they are visited
+        struct graph local;
+        for (int pos = 0; pos < 4; ++pos) {
+            memcpy(&local, &Gs[order[pos]], sizeof(local));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", pos, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", pos, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", pos, local.V);
+            bsg_printf("G[%d].E = %d\n", pos, local.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    // Greedy walk over graphs Gs[0] .. Gs[NG-2], starting at vertex V_ENTRY. In each graph it keeps
+    // scanning the current vertex's neighbors and moving to the one with the smallest distance to the
+    // query until a scan finds no improvement. Writes the final vertex and distance to *v_curr_o / *d_curr_o; returns 0. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q)); // local copy of the query vector
+
+        int   v_curr = V_ENTRY;
+        float d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) {
+            struct graph G = Gs[i];
+            bool changed = true;
+            while (changed) {
+                changed = false;
+                // fetch neighbors; the last vertex has no offsets[v+1], so its list ends at G.E
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) {
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp
new file mode 100644
index 000000000..d7c2bd9c3
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (v2): input_test and ipnsw_greedy_search (distance via inner_product_v2).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Argument smoke test: copies the four graph descriptors out of Gs onto the stack.
+        // Prints only when DEBUG_INPUT_TEST is defined; always returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int order[] = {G_0, G_1, G_2, G_3}; // Gs indices in the order they are visited
+        struct graph local;
+        for (int pos = 0; pos < 4; ++pos) {
+            memcpy(&local, &Gs[order[pos]], sizeof(local));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", pos, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", pos, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", pos, local.V);
+            bsg_printf("G[%d].E = %d\n", pos, local.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    // Greedy walk over graphs Gs[0] .. Gs[NG-2], starting at vertex V_ENTRY. In each graph it keeps
+    // scanning the current vertex's neighbors and moving to the one with the smallest distance to the
+    // query until a scan finds no improvement. Writes the final vertex and distance to *v_curr_o / *d_curr_o; returns 0. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q)); // local copy of the query vector
+
+        int   v_curr = V_ENTRY;
+        float d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) {
+            struct graph G = Gs[i];
+            bool changed = true;
+            while (changed) {
+                changed = false;
+                // fetch neighbors; the last vertex has no offsets[v+1], so its list ends at G.E
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) {
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp
new file mode 100644
index 000000000..aafefe6fd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (v3, serial v4 inner product): input_test and ipnsw_greedy_search (distance via inner_product_v4_serial).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Argument smoke test: copies the four graph descriptors out of Gs onto the stack.
+        // Prints only when DEBUG_INPUT_TEST is defined; always returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int order[] = {G_0, G_1, G_2, G_3}; // Gs indices in the order they are visited
+        struct graph local;
+        for (int pos = 0; pos < 4; ++pos) {
+            memcpy(&local, &Gs[order[pos]], sizeof(local));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", pos, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", pos, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", pos, local.V);
+            bsg_printf("G[%d].E = %d\n", pos, local.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+    // Greedy walk over graphs Gs[0] .. Gs[NG-2], starting at vertex V_ENTRY. In each graph it keeps
+    // scanning the current vertex's neighbors and moving to the one with the smallest distance to the
+    // query until a scan finds no improvement. Writes the final vertex and distance to *v_curr_o / *d_curr_o; returns 0. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY;
+        float d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int layer = 0; layer < NG-1; layer++) {
+            struct graph G = Gs[layer];
+            bool moved;
+            do {
+                moved = false;
+                // neighbor list of the vertex this pass starts from; the last vertex's list ends at G.E
+                const int first = G.offsets[v_curr];
+                const int count = (v_curr == G.V-1 ? G.E : G.offsets[v_curr+1]) - first;
+                for (int k = 0; k < count; k++) {
+                    const int   cand   = G.neighbors[first+k];
+                    // distance from the query to this neighbor
+                    const float d_cand = distance(q, &database[cand*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(cand);
+                    bsg_print_float(d_cand);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (!(d_cand < d_curr))
+                        continue;
+
+                    d_curr = d_cand;
+                    v_curr = cand;
+                    moved = true;
+#if defined(DEBUG_GREEDY_VCURR_TR)
+                    bsg_print_int(v_curr);
+                    bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR)
+                }
+            } while (moved);
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
new file mode 100644
index 000000000..99614fc8b
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (v3): input_test and ipnsw_greedy_search (distance via inner_product_v3).
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index in neighbors where vertex v's adjacency list starts
+    const int *neighbors; // adjacency lists of all vertices, stored back to back
+    int V;                // number of vertices (V-1 is treated as the last vertex)
+    int E;                // length of neighbors; ends the last vertex's list, which has no offsets[v+1]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+        // Argument smoke test: copies the four graph descriptors out of Gs onto the stack.
+        // Prints only when DEBUG_INPUT_TEST is defined; always returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int order[] = {G_0, G_1, G_2, G_3}; // Gs indices in the order they are visited
+        struct graph local;
+        for (int pos = 0; pos < 4; ++pos) {
+            memcpy(&local, &Gs[order[pos]], sizeof(local));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", pos, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", pos, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", pos, local.V);
+            bsg_printf("G[%d].E = %d\n", pos, local.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    // Greedy walk over graphs Gs[0] .. Gs[NG-2], starting at vertex V_ENTRY. In each graph it keeps
+    // scanning the current vertex's neighbors and moving to the one with the smallest distance to the
+    // query until a scan finds no improvement. Writes the final vertex and distance to *v_curr_o / *d_curr_o; returns 0. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY;
+        float d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int layer = 0; layer < NG-1; layer++) {
+            struct graph G = Gs[layer];
+            bool moved;
+            do {
+                moved = false;
+                // neighbor list of the vertex this pass starts from; the last vertex's list ends at G.E
+                const int first = G.offsets[v_curr];
+                const int count = (v_curr == G.V-1 ? G.E : G.offsets[v_curr+1]) - first;
+                for (int k = 0; k < count; k++) {
+                    const int   cand   = G.neighbors[first+k];
+                    // distance from the query to this neighbor
+                    const float d_cand = distance(q, &database[cand*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(cand);
+                    bsg_print_float(d_cand);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (!(d_cand < d_curr))
+                        continue;
+
+                    d_curr = d_cand;
+                    v_curr = cand;
+                    moved = true;
+#if defined(DEBUG_GREEDY_VCURR_TR)
+                    bsg_print_int(v_curr);
+                    bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR)
+                }
+            } while (moved);
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp
new file mode 100644
index 000000000..c60b3e125
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Greedy-walk kernel for IPNSW: tile 0 walks the graph layers while the other tiles help compute inner products
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // one graph layer as read by the kernels in this file
+    const int *offsets;   // offsets[v] is the index in `neighbors` of v's first neighbor
+    const int *neighbors; // neighbor vertex ids, grouped by source vertex
+    int V;                // vertex count; vertex V-1 has no offsets[v+1] entry to read
+    int E;                // edge count; closes the neighbor range of vertex V-1
+};
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Debug aid: copies each layer descriptor out of Gs in G_0..G_3 order; when DEBUG_INPUT_TEST
+    // is defined it also prints the kernel arguments and every descriptor. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int layer_order[] = {G_0, G_1, G_2, G_3};
+        for (int slot = 0; slot < 4; ++slot) {
+            struct graph layer;
+            memcpy(&layer, &Gs[layer_order[slot]], sizeof(layer));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", slot, layer.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", slot, layer.neighbors);
+            bsg_printf("G[%d].V = %d\n", slot, layer.V);
+            bsg_printf("G[%d].E = %d\n", slot, layer.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+#define DEBUG_GREEDY_VIS_TR
+
+    // Greedy walk over graph layers Gs[0..NG-2], starting at V_ENTRY. Tile 0 drives the walk and
+    // writes the final vertex/distance to *v_curr_o / *d_curr_o; other tiles serve inner products.
+    // Distances are negated inner products, so a smaller value is a better match. `seen` is unused.
+    int ipnsw_greedy_search (const graph *Gs,
+                             bsg_attr_remote const float *__restrict database,
+                             const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q)); // local copy of the query vector
+
+        InnerProduct ip(database, q);
+        ip.init(); // tiles other than 0 stay in here until tile 0 calls ip.exit()
+        if (__bsg_id == 0) {
+            bsg_saif_start();
+            int   v_curr = V_ENTRY;
+            // Unary minus keeps the negation in single precision (-1.0 is a double literal).
+            float d_curr = -ip.inner_product(v_curr);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+            for (int i = 0; i < NG-1; i++) {
+                struct graph G = Gs[i];
+                bool changed = true;
+                // Repeat until a full pass over the neighbor list finds nothing closer.
+                while (changed) {
+                    changed = false;
+                    // fetch neighbors; the last vertex has no offsets[v+1], so E closes its range
+                    int dst_0 = G.offsets[v_curr];
+                    int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0;
+                    for (int dst_i = 0; dst_i < degree; dst_i++) {
+                        int dst = G.neighbors[dst_0+dst_i];
+                        // calc. iproduct
+                        float d = -ip.inner_product(dst);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(dst);
+                        bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+                        if (d < d_curr) {
+                            d_curr = d;
+                            v_curr = dst;
+                            changed = true;
+#if defined(DEBUG_GREEDY_VCURR_TR)
+                            bsg_print_int(v_curr);
+                            bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR)
+                        }
+                    }
+                }
+            }
+
+            *v_curr_o = v_curr;
+            *d_curr_o = d_curr;
+            bsg_saif_end();
+            ip.exit(); // release the helper tiles blocked in ip.init(); without this they never return
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp
new file mode 100644
index 000000000..12da1aebb
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp
@@ -0,0 +1,113 @@
+/*
+ * Line-count variant of the greedy-walk kernel: the numeric marker comments below are summed by loc.sh
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // graph layer descriptor; only `neighbors` is referenced by the code in this file
+    const int *offsets;   // presumably offsets[v] indexes v's first entry in `neighbors` — confirm against kernel.cpp
+    const int *neighbors; // NOTE(review): a plain pointer, yet the search below writes G.neighbors(v_curr)
+    int V;                // presumably the vertex count — not read in this file
+    int E;                // presumably the edge count — not read in this file
+};
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+#define DEBUG_GREEDY_VIS_TR
+
+    /**/
+    int ipnsw_greedy_search (const graph *Gs,
+                             bsg_attr_remote const float *__restrict database,
+                             const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    { // condensed greedy walk; tile 0 writes the final vertex and distance through v_curr_o / d_curr_o
+    /* loc:2 */
+        /**/
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+        /* loc:2 */
+
+        /* init code - can be hidden by library*/
+        InnerProduct ip(database, q);
+        ip.init();
+        if (__bsg_id == 0) {
+            bsg_saif_start();
+            /**/
+            int   v_curr = V_ENTRY;
+            float d_curr = 0;
+            // distances are negated inner products, so smaller means a better match
+            d_curr = -1.0 * ip.inner_product(v_curr);
+
+            /**/
+            for (int i = 0; i < NG-1; i++) {
+                struct graph G = Gs[i];
+                bool changed = true;
+                while (changed) {
+                    changed = false;
+                    /* loc:5 */
+                    // fetch neighbors
+                    /**/
+                    for (int dst : G.neighbors(v_curr)) { // NOTE(review): struct graph declares `neighbors` as a pointer, so this call form does not match it
+                        float d = -1.0 * ip.inner_product(dst);
+                        if (d < d_curr) {
+                            d_curr = d;
+                            v_curr = dst;
+                            changed = true;
+                        }
+                    }
+                }
+            }
+            /* loc: 10 */
+            /**/
+            *v_curr_o = v_curr;
+            *d_curr_o = d_curr;
+        }
+        return 0;
+    }
+    /* loc: 5 */
+    
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh
new file mode 100644
index 000000000..1f12e76ba
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh
@@ -0,0 +1 @@
+grep 'loc:' kernel.loc.cpp | cut -d: -f2 | cut -d'*' -f1 | awk 'BEGIN { total = 0 } { total += $1 } END { print total }' # sum the numbers that follow each marker
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp
new file mode 100644
index 000000000..aaaf5317d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp
@@ -0,0 +1,40 @@
+#pragma once
+#include <array>
+#include <algorithm>
+
+// Bounded heap over caller-owned storage: keeps at most N elements in data[0..N), ordered by Comparitor
+// exactly as std::push_heap/std::pop_heap order them (top() is the element that sorts last).
+template <typename T, typename Comparitor>
+class DynHeap {
+public:
+    // Members are initialized in declaration order; the heap starts empty.
+    DynHeap(T *data, int N): _n(0), _data_N(N), _data(data) {}
+
+    // Insert i. When the heap is full the current top is evicted (or i itself is dropped if it
+    // would sort after the top), so the buffer is never written past index N-1.
+    void push(T i) {
+        if (_n < _data_N) {
+            _data[_n++] = i;
+            std::push_heap(_data, _data+_n, Comparitor());
+        } else if (_n > 0 && !Comparitor()(_data[0], i)) {
+            std::pop_heap(_data, _data+_n, Comparitor()); // old top moves to the last slot
+            _data[_n-1] = i;                              // ...and is replaced by i
+            std::push_heap(_data, _data+_n, Comparitor());
+        }
+    }
+
+    // Remove and return the top element; the heap must not be empty.
+    T pop() {
+        std::pop_heap(_data, _data+_n, Comparitor());
+        return _data[--_n];
+    }
+
+    // Accessors.
+    T    top()   const { return _data[0]; } // meaningful only while !empty()
+    bool empty() const { return _n == 0; }
+    int  size()  const { return _n; }
+
+    int _n;      // number of elements currently stored
+    int _data_N; // capacity of _data
+    T  *_data;   // caller-owned storage
+};
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp
new file mode 100644
index 000000000..95d6b291e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp
@@ -0,0 +1,6 @@
+#ifndef __HELLO_WORLD_HPP
+#define __HELLO_WORLD_HPP
+
+#include <cstdint>
+
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
new file mode 100644
index 000000000..6099411c2
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
@@ -0,0 +1,319 @@
+#pragma once
+#include "bsg_striped_array.hpp"
+#include <cmath>
+#include <numeric>
+#include <bsg_manycore.hpp>
+#include "sleep_until_valid.hpp"
+
+// Partial dot product of a and b: this tile sums the BSIZE-element blocks __bsg_id, __bsg_id + TG_X*TG_Y, ...
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10>
+__attribute__((noinline))
+FLOAT_T inner_product(const FLOAT_T *__restrict a, const FLOAT_T *__restrict b)
+{
+    FLOAT_T acc = 0.0;
+    for (int base = __bsg_id * BSIZE; base < VSIZE; base += BSIZE * TG_X * TG_Y) {
+        #pragma GCC unroll 32
+        for (int off = 0; off < BSIZE; ++off)
+            acc += a[base + off]*b[base + off];
+    }
+    return acc;
+}
+
+
+// Block-strided partial dot product (blocks __bsg_id, __bsg_id + TG_X*TG_Y, ...); b is a bsg_attr_remote pointer.
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10>
+__attribute__((noinline))
+FLOAT_T inner_product_v1(const FLOAT_T *__restrict a,
+                         bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    FLOAT_T acc = 0.0;
+    for (int base = __bsg_id * BSIZE; base < VSIZE; base += BSIZE * TG_X * TG_Y) {
+        #pragma GCC unroll 32
+        for (int off = 0; off < BSIZE; ++off)
+            acc += a[base + off]*b[base + off];
+    }
+    return acc;
+}
+
+
+// Block-strided partial dot product accumulated with fmaf(); this tile takes blocks __bsg_id, __bsg_id + TG_X*TG_Y, ...
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10>
+__attribute__((noinline))
+FLOAT_T inner_product_v2(const FLOAT_T *__restrict a,
+                         bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    FLOAT_T acc = 0.0;
+    for (int base = __bsg_id * BSIZE; base < VSIZE; base += BSIZE * TG_X * TG_Y) {
+        #pragma GCC unroll 32
+        for (int off = 0; off < BSIZE; ++off)
+            acc = fmaf(a[base+off], b[base+off], acc);
+    }
+    return acc;
+}
+
+
+
+// Partial dot product with two accumulators; each tile owns pairs of adjacent BSIZE blocks (VSIZE must be a multiple of 2*BSIZE).
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10>
+__attribute__((noinline))
+FLOAT_T inner_product_v3(const FLOAT_T *__restrict a, bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    FLOAT_T r0 = 0.0, r1 = 0.0;
+    // Each step consumes 2*BSIZE elements, so tiles start 2*BSIZE apart; a BSIZE spacing made them overlap.
+    for (int i = __bsg_id * 2 * BSIZE; i < VSIZE; i += 2 * BSIZE * TG_X * TG_Y) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+            r0 = fmaf(a[i+j+0*BSIZE], b[i+j+0*BSIZE], r0);
+            r1 = fmaf(a[i+j+1*BSIZE], b[i+j+1*BSIZE], r1);
+        }
+    }
+    return r0+r1;
+}
+
+// Partial dot product with UNROLL independent accumulators; each tile owns chunks of BSIZE*UNROLL elements.
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5>
+__attribute__((noinline))
+FLOAT_T inner_product_v4(const FLOAT_T *__restrict a, bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    FLOAT_T r[UNROLL] = {0}; // no `register`: the specifier is deprecated in C++11 and removed in C++17
+    for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i)
+        rs += r[i];
+    return rs;
+}
+
+// Per-tile share of a dot product: tile __bsg_id sums chunks of BSIZE*UNROLL elements using UNROLL accumulators.
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5>
+__attribute__((noinline))
+FLOAT_T inner_product_parallel_v1(const FLOAT_T *__restrict a,
+                                  bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    // No `register` specifier: it is deprecated in C++11 and removed in C++17.
+    FLOAT_T r[UNROLL] = {0.0};
+    for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i)
+        rs += r[i];
+    return rs;
+}
+
+
+// Full dot product of two VSIZE-element vectors on a single tile, using UNROLL independent accumulators.
+template<typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5>
+__attribute__((noinline))
+FLOAT_T inner_product_v4_serial(const FLOAT_T *__restrict a, bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    FLOAT_T r[UNROLL] = {0}; // no `register`: the specifier is deprecated in C++11 and removed in C++17
+    for (int i = 0; i < VSIZE; i += UNROLL * BSIZE) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i)
+        rs += r[i];
+    return rs;
+}
+
+
+// Share number `id` (of TG_N) of a dot product: sums chunks of BSIZE*UNROLL elements, TG_N chunks apart.
+template<std::size_t TG_N,
+         typename    FLOAT_T=float,
+         std::size_t VSIZE=100,
+         std::size_t BSIZE=5,
+         int         UNROLL=5>
+FLOAT_T inner_product_parallel_v2(
+    int id,
+    const FLOAT_T *__restrict a,
+    bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    // No `register` specifier: it is deprecated in C++11 and removed in C++17.
+    FLOAT_T r[UNROLL] = {0.0};
+    for (int i = id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_N) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i)
+        rs += r[i];
+    return rs;
+}
+
+template <std::size_t TG_X, std::size_t TG_Y>
+class InnerProductParallel_v1 { // tile 0 requests inner products; the other tiles of the group compute partial sums for it
+public:
+    static constexpr std::size_t VSIZE = 100;        // floats per vector; row idx of _t1 starts at idx * VSIZE
+    static constexpr std::size_t TG_N = TG_X * TG_Y; // number of tiles in the group
+    static constexpr int SYNC_DONE = -2;             // written to _t1_idx by exit() to end the init() loop
+    static constexpr int SYNC_INV  = -1;             // "no request pending" value of _t1_idx
+    // t1: table of VSIZE-float rows; t2: the vector every requested row is multiplied with.
+    InnerProductParallel_v1(bsg_attr_remote const float *t1, const float *t2) {
+        _inf = INFINITY;
+        for (int i = 0; i < TG_N; ++i)
+            _partial[i] = _inf;
+        // Cache one pointer per tile id; presumably each addresses that tile's own _t1_idx — confirm.
+        for (int x = 0; x < TG_X; ++x)
+            for (int y = 0; y < TG_Y; ++y)
+                _t1_idx_group[bsg_x_y_to_id(x,y)]
+                    = bsg_tile_group_remote_pointer(x,y,&_t1_idx);
+        // NOTE(review): nothing here orders this SYNC_INV store against a request written by tile 0 — confirm callers synchronize.
+        _t1 = t1;
+        _t2 = t2;
+        _t1_idx = SYNC_INV;
+    }
+    // Worker loop: returns at once on tile 0; other tiles serve requests until SYNC_DONE arrives.
+    void init() {
+        if (__bsg_id == 0) {
+            return;
+        }
+
+        float p = 0.0;
+        int t1_idx;
+        float *partial_result = bsg_tile_group_remote_pointer(0, 0, &_partial[__bsg_id]);
+        // Each pass: wait for a row index, compute this tile's share, store it through partial_result.
+        while (true) {
+            t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV);
+            if (t1_idx == SYNC_DONE)
+                break;
+
+            p = inner_product_parallel_v1<TG_X,TG_Y>(_t2, &_t1[t1_idx * VSIZE]);
+            *partial_result = p;
+        }
+    }
+    // On tile 0: inner product of _t2 with row idx of _t1. On any other tile: returns 0.0 immediately.
+    float inner_product(int idx) {
+        if (__bsg_id != 0)
+            return 0.0;
+        // Publish the row index through every cached pointer (tile 0's own slot included).
+        for (int tile = 0; tile < TG_X*TG_Y; ++tile)
+            *_t1_idx_group[tile] = idx;
+        // Tile 0 computes its own share while the other tiles work on theirs.
+        _partial[__bsg_id] = inner_product_parallel_v1<TG_X,TG_Y>(_t2, &_t1[idx * VSIZE]);
+        // Collect: sleep_until_valid() waits for each slot to differ from _inf, then resets it to _inf.
+        float r = 0.0;
+        for (int tile = 0; tile <TG_X*TG_Y; ++tile) {
+            float tmp = sleep_until_valid(&_partial[tile], _inf);
+            r += tmp;
+        }
+
+        return r;
+    }
+    // On tile 0: write SYNC_DONE through every cached pointer so the init() loops can break.
+    void exit() {
+        if (__bsg_id != 0)
+            return;
+
+        for (int tile = 0; tile < TG_X*TG_Y; ++tile)
+            *_t1_idx_group[tile] = SYNC_DONE;
+
+        return;
+    }
+
+    bsg_attr_remote const float   *_t1;                 // table of vectors
+    const float                   *_t2;                 // vector multiplied with the requested row
+    int                            _t1_idx;             // request slot: SYNC_INV, SYNC_DONE, or a row index
+    int                           *_t1_idx_group[TG_N]; // per-tile pointers used to publish requests
+    float                          _partial[TG_N];      // result slots; _inf marks "not yet written"
+    float                          _inf;                // INFINITY, the "not valid" value for _partial
+};
+
+template <std::size_t TG_Y>
+class InnerProductParallel_Y { // per-column variant: the tile with __bsg_y == 0 requests, tiles below it in the same column compute shares
+public:
+    static constexpr std::size_t VSIZE = 100; // floats per vector; row idx of _t1 starts at idx * VSIZE
+    static constexpr int SYNC_DONE = -2;      // written to _t1_idx by exit() to end the init() loop
+    static constexpr int SYNC_INV  = -1;      // "no request pending" value of _t1_idx
+    // t1: table of VSIZE-float rows; t2: the vector every requested row is multiplied with.
+    InnerProductParallel_Y(bsg_attr_remote const float *t1, const float *t2) {
+        _inf = INFINITY;
+        for (int i = 0; i < TG_Y; ++i)
+            _partial[i] = _inf;
+        // Cache one pointer per row y of this tile's column (x fixed to __bsg_x).
+        for (int y = 0; y < TG_Y; ++y)
+            _t1_idx_group[y] = bsg_tile_group_remote_pointer(__bsg_x, y, &_t1_idx);
+        // NOTE(review): nothing here orders this SYNC_INV store against a request written by the row-0 tile — confirm callers synchronize.
+        _t1 = t1;
+        _t2 = t2;
+        _t1_idx = SYNC_INV;
+    }
+    // Worker loop: returns at once when __bsg_y == 0; other tiles serve requests until SYNC_DONE arrives.
+    void init() {
+        if (__bsg_y == 0) {
+            return;
+        }
+
+        float p = 0.0;
+        int t1_idx;
+        float *partial_result = bsg_tile_group_remote_pointer(__bsg_x, 0, &_partial[__bsg_y]);
+        // Each pass: wait for a row index, compute share __bsg_y of TG_Y, store it through partial_result.
+        while (true) {
+            t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV);
+            if (t1_idx == SYNC_DONE)
+                break;
+
+            p = inner_product_parallel_v2<TG_Y>(__bsg_y, _t2, &_t1[t1_idx * VSIZE]);
+            *partial_result = p;
+        }
+    }
+    // When __bsg_y == 0: inner product of _t2 with row idx of _t1. On any other tile: returns 0.0 immediately.
+    float inner_product(int idx) {
+        if (__bsg_y != 0)
+            return 0.0;
+        // Publish the row index through every cached pointer (this tile's own slot included).
+        for (int tile = 0; tile < TG_Y; ++tile)
+            *_t1_idx_group[tile] = idx;
+        // This tile computes its own share while the rest of the column works on theirs.
+        _partial[__bsg_y] = inner_product_parallel_v2<TG_Y>(__bsg_y, _t2, &_t1[idx * VSIZE]);
+        // Collect: sleep_until_valid() waits for each slot to differ from _inf, then resets it to _inf.
+        float r = 0.0;
+        for (int tile = 0; tile <TG_Y; ++tile) {
+            float tmp = sleep_until_valid(&_partial[tile], _inf);
+            r += tmp;
+        }
+
+        return r;
+    }
+    // When __bsg_y == 0: write SYNC_DONE through every cached pointer so the init() loops can break.
+    void exit() {
+        if (__bsg_y != 0)
+            return;
+
+        for (int tile = 0; tile < TG_Y; ++tile)
+            *_t1_idx_group[tile] = SYNC_DONE;
+
+        return;
+    }
+
+    bsg_attr_remote const float   *_t1;                  // table of vectors
+    const float                   *_t2;                  // vector multiplied with the requested row
+    int                            _t1_idx;              // request slot: SYNC_INV, SYNC_DONE, or a row index
+    int                           *_t1_idx_group [TG_Y]; // per-row pointers used to publish requests
+    float                          _partial      [TG_Y]; // result slots; _inf marks "not yet written"
+    float                          _inf;                 // INFINITY, the "not valid" value for _partial
+};
+
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
new file mode 100644
index 000000000..2e3621fef
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
@@ -0,0 +1,86 @@
+#pragma once
+#include <algorithm>
+#include <atomic>
+// Sorted-array set over caller-owned storage `data`, ordered by Comparitor. Duplicates are kept
+// (each insert grows size() by one); the caller must not insert more than N elements.
+template<typename T, typename Comparitor>
+class DynSet {
+public:
+    DynSet(T *data, int N): _data(data), _n(0), _data_N(N) {} // same order as the member declarations
+
+    // Insert i at its sorted position: [0, _n) is already sorted, so shifting the tail is enough.
+    void insert(T i) {
+        T *pos = std::upper_bound(_data, _data+_n, i, Comparitor());
+        std::copy_backward(pos, _data+_n, _data+_n+1);
+        *pos = i;
+        ++_n;
+    }
+
+    // True if an element equivalent to i (under Comparitor) has been inserted.
+    bool in(T i) {
+        return std::binary_search(_data, _data+_n, i, Comparitor());
+    }
+
+    int size() const { return _n; }
+
+    T    *_data;   // caller-owned storage, sorted in [0, _n)
+    int   _n;      // number of stored elements
+    int   _data_N; // capacity passed by the caller (not enforced here)
+};
+
+// Membership set backed by one int flag per element, stored in a caller-owned array:
+// data[i] == 1 means i is in the set.
+template<typename T>
+class DenseSet {
+public:
+    DenseSet(int *data): _data(data) {}
+
+    // Mark i as present.
+    void insert(T i) { _data[i] = 1; }
+
+    // Same plain store as insert(); no atomic instruction is used here.
+    void atomic_insert(T i) { insert(i); }
+
+    // True only when the flag for i holds exactly 1.
+    bool in(T i) {
+        const bool present = (_data[i] == 1);
+        return present;
+    }
+
+    int *_data; // flag array, indexed directly by element value
+};
+
+// Bit-set over caller-owned 32-bit words: element i maps to bit (i & 31) of word (i >> 5).
+template<typename T>
+class DenseSet_v1 {
+public:
+    DenseSet_v1(int *data) : _data(data) {}
+
+    // Set the bit for i with a plain read-modify-write (not safe against concurrent writers).
+    void insert(T i) {
+        _data[word(i)] |= mask(i);
+    }
+
+    // Set the bit for i with a single RISC-V `amoor.w`; the old word value is discarded (rd = x0).
+    void atomic_insert(T i) {
+        int *ptr = &_data[word(i)];
+        int r    = mask(i);
+        // "+A": the address is held in a register (AMOs take no offset) and the word is declared
+        // as both read and written, so the compiler does not reuse a stale copy of it afterwards.
+        asm volatile ("amoor.w x0, %[r], %[ptr]" :
+                      [ptr] "+A" (*ptr) :
+                      [r] "r" (r));
+    }
+
+    // True if the bit for i is set.
+    bool in(T i) {
+        return (_data[word(i)] & mask(i)) != 0;
+    }
+
+    int word(T i) const { return i >> 5; } // index of the word holding i
+    int bit(T i)  const { return i & 31; } // bit position of i inside that word
+    // Single-bit mask for i; built unsigned because 1 << 31 overflows a signed int.
+    int mask(T i) const { return static_cast<int>(1u << bit(i)); }
+
+    int *_data; // caller-owned word array
+};
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp
new file mode 100644
index 000000000..d59088d75
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include <bsg_manycore.h>
+template <typename T> // the value is loaded with a 32-bit lr.w, so T is presumably a 4-byte type — TODO confirm
+static inline T sleep_on_update(volatile T *ptr)
+{
+    T r; // receives the word loaded from *ptr
+    asm volatile ("lr.w.aq %[r], %[ptr]" : // load-reserved (acquire form) from *ptr into r
+                  [r]   "=r" (r) :
+                  [ptr] "m"  (*ptr)
+        ); // NOTE(review): the name suggests this load stalls until *ptr is written on this hardware — confirm
+    return r;
+}
+
+template <typename T> // returns the first value of *ptr that differs from not_valid, then resets *ptr to not_valid
+static inline T sleep_until_valid(volatile T *ptr, T not_valid)
+{
+    T r;
+    // First read: a plain load-reserved of the current value of *ptr.
+    asm volatile ("lr.w %[r], %[ptr]" :
+                  [r] "=r" (r) :
+                  [ptr] "m" (*ptr));
+    // Keep re-reading through sleep_on_update() for as long as the slot still holds not_valid.
+    while (r == not_valid) {
+        r = sleep_on_update(ptr);
+    }
+    *ptr = not_valid; // re-arm the slot; NOTE(review): a value written between the read and this store would be lost
+    return r;
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp
new file mode 100644
index 000000000..9fe605f3a
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Inner-product microbenchmark: a master tile and slave tiles, synchronized with a tile-group barrier
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define VISIT_BUFSIZE 512
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // graph layer descriptor; not referenced by the kernels in this file
+    const int *offsets;   // presumably per-vertex start index into `neighbors` — confirm against the greedy-walk kernels
+    const int *neighbors; // presumably the concatenated neighbor lists — confirm
+    int V;                // presumably the vertex count — confirm
+    int E;                // presumably the edge count — confirm
+};
+
+//#define DEBUG_SLAVE
+//#define DEBUG_MASTER
+
+using barrier = bsg_barrier<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_parallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+#define SYNC_DONE -1
+   
+    // Master side of the microbenchmark. For each of this tile group's N entries in visit_remote_all it
+    // publishes the vector index to every tile, computes its own partial product between two barrier
+    // syncs, and adds up all tiles' partials. Returns the running sum truncated to int.
+    __attribute__((noinline))
+    int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int N,
+                                  int *visit_remote_all,
+                                  barrier *group_barrier,
+                                  std::atomic<int> *kp,
+                                  std::atomic<float> *rp)
+    {
+        float r = 0.0;
+        int visit[VISIT_BUFSIZE];
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id];
+        // pre-compute addresses on remote tiles
+        std::atomic<int>   *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        std::atomic<float> *rp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) {
+            for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) {
+                kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp);
+                rp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, rp);
+            }
+        }
+
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            size_t sz = std::min(VISIT_BUFSIZE, (N-i));
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+            for (int j = 0; j < sz; ++j) {
+                // read k
+                int k = visit[j];
+                // set k on all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+                    kp_group[tile]->store(k, std::memory_order_relaxed);
+                // do inner product
+                group_barrier->sync(); // signal ready
+                float r_local = iproduct(query, &database[k * VSIZE]);
+#ifdef DEBUG_MASTER
+                bsg_print_float(r_local);
+#endif
+                rp_group[__bsg_id]->store(r_local, std::memory_order_relaxed);
+                group_barrier->sync(); // signal done
+                // read r from all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) {
+                    float r_remote = rp_group[tile]->load(std::memory_order_relaxed);
+#ifdef DEBUG_MASTER
+                    bsg_print_float(r_remote);
+#endif
+                    r += r_remote;
+                }
+            }
+        }
+        // Slaves wait in their "ready" barrier: publish SYNC_DONE and sync once more so they can exit.
+        for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+            kp_group[tile]->store(SYNC_DONE, std::memory_order_relaxed);
+        group_barrier->sync();
+        return (int)r;
+    }
+
+    // Slave side of the microbenchmark: after each "ready" barrier it reads the vector index from *kp,
+    // stores its partial product to *rp, and syncs again; it returns once the index is SYNC_DONE.
+    __attribute__((noinline))
+    void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  barrier *group_barrier,
+                                  std::atomic<int> *kp,
+                                  std::atomic<float> *rp)
+    {
+        for (;;) {
+            // wait at the barrier, then pick up the index found in *kp
+            group_barrier->sync();
+            const int next = kp->load(std::memory_order_relaxed);
+            if (next == SYNC_DONE) {
+                return;
+            }
+
+            // compute this tile's partial product and publish it
+            const float partial = iproduct(query, &database[next * VSIZE]);
+            rp->store(partial, std::memory_order_relaxed);
+#ifdef DEBUG_SLAVE
+            bsg_print_float(partial);
+#endif
+            group_barrier->sync(); // result is stored; matches the master's "done" sync
+        }
+    }
+    
+    // Kernel entry point: tile 0 runs the master loop, every other tile runs the slave loop.
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        static barrier group_barrier;
+        static std::atomic<int> k;
+        static std::atomic<float> r;
+        float rr = 0.0f; // only the master assigns rr; slave tiles return 0 instead of an indeterminate value
+
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0);
+        if (__bsg_id == 0) {
+            // enter master loop
+            rr = inner_product_ubmk_master(database, q, N, visit_remote_all,
+                                           &group_barrier, &k, &r);
+        } else {
+            // enter slave loop
+            inner_product_ubmk_slave(database, q, &group_barrier, &k, &r);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(rr);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp
new file mode 100644
index 000000000..df8be2dae
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Inner-product microbenchmark: master and slave tiles hand work over through sleep_until_valid() slots
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+#include "sleep_until_valid.hpp"
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define VISIT_BUFSIZE 512
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_MASTER
+//#define DEBUG_SLAVE
+#define iproduct(x,y)                                                   \
+    inner_product_parallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    #define SYNC_INV  -2
+    #define SYNC_DONE -1
+
+        __attribute__((noinline))
+    int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int N,
+                                  int *visit_remote_all,
+                                  int   *kp,
+                                  float *rp)
+    {
+        float r = 0.0;
+        int visit[VISIT_BUFSIZE];
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id];
+        // Master loop: publishes each index to all tiles, sums the partials read from rp[], returns the sum as int.
+        // pre-compute addresses on remote tiles
+        int   *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+
+        for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) {
+            for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) {
+                kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp);
+            }
+        }
+        // Indices are staged through the local `visit` buffer, at most VISIT_BUFSIZE at a time.
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            size_t sz = std::min(VISIT_BUFSIZE, (N-i));
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+
+            for (int j = 0; j < sz; ++j) {
+                // read k
+                int k = visit[j];
+
+                // set k on all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+                    *kp_group[tile] = k;
+                // the master's own share goes into its slot rp[__bsg_id]
+                float r_local = iproduct(query, &database[k * VSIZE]);
+#ifdef DEBUG_MASTER
+                bsg_print_float(r_local);
+#endif
+                rp[__bsg_id] = r_local;
+
+                // read r from all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) {
+                    float r_remote = sleep_until_valid(&rp[tile], INFINITY); // waits while the slot holds INFINITY, then resets it
+#ifdef DEBUG_MASTER
+                    bsg_print_float(r_remote);
+#endif
+                    r += r_remote;
+                }
+            }
+        }
+        // all indices processed: publish SYNC_DONE so the slave loops can break
+        for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+            *kp_group[tile] = SYNC_DONE;
+
+        return (int)r;
+    }
+
+    // Slave side: waits for *kp to leave SYNC_INV, stores its partial product for that index to *rp,
+    // and repeats; it returns once the value read from *kp is SYNC_DONE.
+    __attribute__((noinline))
+    void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int   *kp,
+                                  float *rp)
+    {
+        for (;;) {
+            // block until a value other than SYNC_INV is found in *kp
+            const int next = sleep_until_valid(kp, SYNC_INV);
+            if (next == SYNC_DONE) {
+                return;
+            }
+
+            const float partial = iproduct(query, &database[next * VSIZE]);
+            *rp = partial;
+#ifdef DEBUG_SLAVE
+            bsg_print_float(partial);
+#endif
+        }
+    }
+    
+    // Kernel entry point: tile 0 runs the master loop, every other tile runs the slave loop.
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        static int   k = SYNC_INV;
+        static float r [BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        float rr = 0.0f; // only the master assigns rr; slave tiles return 0 instead of an indeterminate value
+        // `= {INFINITY}` marked only r[0] as "not ready"; every slot must start at INFINITY before a k is published.
+        std::fill(r, r + BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM, INFINITY);
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0);
+        if (__bsg_id == 0) {
+            // enter master loop
+            rr = inner_product_ubmk_master(database, q, N, visit_remote_all, &k, r);
+        } else {
+            // enter slave loop
+            inner_product_ubmk_slave(database, q, &k, bsg_tile_group_remote_pointer(0,0, &r[__bsg_id]));
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(rr);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp
new file mode 100644
index 000000000..be92dfcc9
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp
@@ -0,0 +1,80 @@
+/*
+ * Inner-product microbenchmark kernel built on InnerProductParallel_v1 for a 2x2 tile group.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+#include "sleep_until_valid.hpp"
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define VISIT_BUFSIZE 512
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+using barrier = bsg_barrier<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {   // Kernel entry: sums ip.inner_product(k) over the N indices in this tile group's visit slice.
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+        barrier b;
+
+        bsg_cuda_print_stat_start(0);
+
+        InnerProduct ip(database, q);
+        ip.init();
+        float r = 0.0;
+        int visit[VISIT_BUFSIZE];
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; // this tile group's N-entry slice
+        // walk the index list in chunks of at most VISIT_BUFSIZE entries staged in visit[]
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            const int sz = std::min(VISIT_BUFSIZE, (N-i)); // int: matches std::min's result and the int loop index below
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+
+            for (int j = 0; j < sz; ++j) {
+                // read k
+                int k = visit[j];
+                float rp = ip.inner_product(k);
+                r += rp;
+            }
+        }
+
+        ip.exit();
+        bsg_cuda_print_stat_end(0);
+        b.sync();
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp
new file mode 100644
index 000000000..fc8dd7c82
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp
@@ -0,0 +1,71 @@
+/*
+ * Inner-product microbenchmark kernel: sums N inner products of the query with database vectors.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    int inner_product_ubmk(const float *database, const float *query, int N) // returns the accumulated sum truncated to int
+    {
+        float q[VSIZE];
+        float r = 0;
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0); // NOTE(review): presumably opens the measured stats region (tag 0) - confirm
+        // N inner products of q against database vectors at a fixed stride of 3*VSIZE floats (not random)
+        for (int i = 0; i < N; ++i) {
+            const float *b = &database[i*3*VSIZE];
+            r += inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(q,b);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp
new file mode 100644
index 000000000..9d9dcbc11
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Inner-product microbenchmark kernel (inner_product_v3) driven by a per-tile-group list of vector indices.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define VISIT_BUFSIZE 512
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {   // Kernel entry: sums iproduct(q, row k) over the N indices in this tile group's visit slice.
+        float q[VSIZE];
+        float r = 0;
+        int visit[VISIT_BUFSIZE];
+        // each tile group reads its own N-entry slice of the index list
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id];
+
+        // debug output ahead of the timed region: negated tile group id, N, then the three argument addresses
+        bsg_print_int(-1 * __bsg_tile_group_id);
+        bsg_print_int(N);
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(database));
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(query));
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(visit_remote_all));
+
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0);
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            const int sz = std::min(VISIT_BUFSIZE, (N-i)); // int: matches std::min's result and the int loop index below
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+
+            for (int j = 0; j < sz; ++j) {
+                int k = visit[j];
+                // visit[] supplies database row indices; row k starts at float offset k*VSIZE
+                r += iproduct(q, &database[k*VSIZE]);
+            }
+        }
+        bsg_cuda_print_stat_end(0);
+
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp
new file mode 100644
index 000000000..2deb68437
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark kernel for the inner_product_v1 implementation.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+    
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N) // number of inner products to accumulate
+    {
+        float q[VSIZE];
+        float r = 0;
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0); // NOTE(review): presumably opens the measured stats region (tag 0) - confirm
+        // accumulate N inner products of q against database vectors
+        for (int i = 0; i < N; ++i) {
+            // fixed stride of 3*VSIZE floats between consecutive operands (not random)
+            r += iproduct(q, &database[i*3*VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp
new file mode 100644
index 000000000..0d4fce43b
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark kernel for the inner_product_v2 implementation.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N) // number of inner products to accumulate
+    {
+        float q[VSIZE];
+        float r = 0;
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0); // NOTE(review): presumably opens the measured stats region (tag 0) - confirm
+        // accumulate N inner products of q against database vectors
+        for (int i = 0; i < N; ++i) {
+            // fixed stride of 3*VSIZE floats between consecutive operands (not random)
+            r += iproduct(q, &database[i*3*VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp
new file mode 100644
index 000000000..8f1058017
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark kernel for the inner_product_v3 implementation.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N) // number of inner products to accumulate
+    {
+        float q[VSIZE];
+        float r = 0;
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0); // NOTE(review): presumably opens the measured stats region (tag 0) - confirm
+        // accumulate N inner products of q against database vectors
+        for (int i = 0; i < N; ++i) {
+            // fixed stride of 3*VSIZE floats between consecutive operands (not random)
+            r += iproduct(q, &database[i*3*VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp
new file mode 100644
index 000000000..c1ab7a9ba
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark kernel for the inner_product_v4 implementation.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // NOTE(review): no code in this file uses this struct
+    const int *offsets;   // presumably per-vertex start index into neighbors (CSR-style) - TODO confirm
+    const int *neighbors; // presumably the concatenated adjacency lists - TODO confirm
+    int V; // presumably the vertex count - TODO confirm
+    int E; // presumably the edge count - TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v4<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N) // number of inner products to accumulate
+    {
+        float q[VSIZE];
+        float r = 0;
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q)); // private copy of the VSIZE-float query
+
+        bsg_cuda_print_stat_start(0); // NOTE(review): presumably opens the measured stats region (tag 0) - confirm
+        // accumulate N inner products of q against database vectors
+        for (int i = 0; i < N; ++i) {
+            // fixed stride of 3*VSIZE floats between consecutive operands (not random)
+            r += iproduct(q, &database[i*3*VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r); // float sum truncated to int
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/template.mk b/examples/sdh-eval-workloads/ipnsw/template.mk
new file mode 100644
index 000000000..13c1e5919
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/template.mk
@@ -0,0 +1,72 @@
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+include $(REPLICANT_PATH)/environment.mk
+include $(BSG_MACHINE_PATH)/Makefile.machine.include
+
+# kernel code: the device binary and its RISC-V build settings (build rules come from cuda/riscv.mk)
+BSG_MANYCORE_KERNELS = kernel.riscv
+
+RISCV_CCPPFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/kernel/include
+RISCV_CCPPFLAGS += -Dbsg_tiles_X=1
+RISCV_CCPPFLAGS += -Dbsg_tiles_Y=1
+
+RISCV_TARGET_OBJECTS = kernel.rvo
+kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
+RISCV_OPT_LEVEL = -O3
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+RISCV_LDFLAGS := $(filter-out -nostdlib,$(RISCV_LDFLAGS))
+
+# host code: locations and make fragments of the graph-tools and hammerblade-helpers submodules
+graphtools-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools
+hammerblade-helpers-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers
+
+include $(graphtools-dir)/libgraphtools.mk
+include $(hammerblade-helpers-dir)/libhammerblade-helpers-host.mk
+
+# header files the host objects depend on (GreedyWalkResults.hpp is listed twice; the repeat is redundant)
+TEST_HEADERS := $(libhammerblade-helpers-host-interface-headers)
+TEST_HEADERS += $(libgraphtools-interface-headers)
+TEST_HEADERS += GreedyWalkResults.hpp
+TEST_HEADERS += IO.hpp
+TEST_HEADERS += IPNSWGraph.hpp
+TEST_HEADERS += IPNSWRunner.hpp
+TEST_HEADERS += IPNSWKernelRunner.hpp
+TEST_HEADERS += GreedyWalkKernelRunner.hpp
+TEST_HEADERS += BeamSearchKernelRunner.hpp
+TEST_HEADERS += IProductUBmkKernelRunner.hpp
+TEST_HEADERS += IPNSWResultReader.hpp
+TEST_HEADERS += GreedyWalkResultReader.hpp
+TEST_HEADERS += BeamSearchResultReader.hpp
+TEST_HEADERS += GreedyWalkResults.hpp
+TEST_HEADERS += IPNSWFactory.hpp
+TEST_HEADERS += GreedyWalkFactory.hpp
+TEST_HEADERS += BeamSearchFactory.hpp
+TEST_HEADERS += IProductUBmkFactory.hpp
+TEST_HEADERS += StringHelpers.hpp
+
+# source files compiled into the host program
+TEST_SOURCES := GreedyWalkResults.cpp
+TEST_SOURCES += ipnsw.cpp
+
+# cxxflags: interface flags of both helper libraries, the ipnsw include path, and -DCOSIM
+CXXFLAGS += $(libgraphtools-interface-cxxflags)
+CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags)
+CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+CXXFLAGS += -DCOSIM
+
+# ldflags: interface link flags of both helper libraries
+LDFLAGS += $(libgraphtools-interface-ldflags)
+LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags)
+
+vpath %.cpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+vpath %.hpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+
+TEST_NAME = main
+
+include $(EXAMPLES_PATH)/compilation.mk
+include $(EXAMPLES_PATH)/link.mk
+
+# mark dependencies: host objects are rebuilt when the graph-tools libraries or any listed header change
+$(TEST_OBJECTS): $(libgraphtools-interface-libraries)
+$(TEST_OBJECTS): $(TEST_HEADERS)
+
+include $(EXAMPLES_PATH)/execution.mk