From 11664ab6ab8300feeaa23cc43a4e9ced48d2fafc Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 11:25:13 -0700
Subject: [PATCH 01/22] IPNSW source from baseline

---
 .gitmodules                                   |   3 +
 .../ipnsw/BeamSearchFactory.hpp               |  11 +
 .../ipnsw/BeamSearchKernelRunner.hpp          |  38 ++
 .../ipnsw/BeamSearchResultReader.hpp          |  25 +
 .../ipnsw/GreedyWalkFactory.hpp               |  12 +
 .../ipnsw/GreedyWalkKernelRunner.hpp          |  25 +
 .../ipnsw/GreedyWalkResultReader.hpp          |  21 +
 .../ipnsw/GreedyWalkResults.cpp               | 517 ++++++++++++++++++
 .../ipnsw/GreedyWalkResults.hpp               |   9 +
 examples/sdh-eval-workloads/ipnsw/IO.hpp      | 223 ++++++++
 .../sdh-eval-workloads/ipnsw/IPNSWFactory.hpp |  17 +
 .../sdh-eval-workloads/ipnsw/IPNSWGraph.hpp   |  69 +++
 .../ipnsw/IPNSWKernelRunner.hpp               |  31 ++
 .../ipnsw/IPNSWResultReader.hpp               |  13 +
 .../sdh-eval-workloads/ipnsw/IPNSWRunner.hpp  | 184 +++++++
 .../ipnsw/IProductUBmkFactory.hpp             |  19 +
 .../ipnsw/IProductUBmkKernelRunner.hpp        |  31 ++
 .../ipnsw/IProductUBmkResultReader.hpp        |  12 +
 examples/sdh-eval-workloads/ipnsw/Makefile    | 351 ++++++++++++
 .../ipnsw/StringHelpers.hpp                   |  17 +
 .../sdh-eval-workloads/ipnsw/hb-prog-eval     |   1 +
 examples/sdh-eval-workloads/ipnsw/ipnsw.cpp   |  86 +++
 examples/sdh-eval-workloads/ipnsw/ipnsw.hpp   |  37 ++
 .../ipnsw/kernel/beam_search/kernel.cpp       | 182 ++++++
 .../ipnsw/kernel/beam_search_v1/kernel.cpp    | 188 +++++++
 .../ipnsw/kernel/beam_search_v2/kernel.cpp    | 189 +++++++
 .../ipnsw/kernel/beam_search_v3/kernel.cpp    | 189 +++++++
 .../ipnsw/kernel/beam_search_v4/kernel.cpp    | 189 +++++++
 .../ipnsw/kernel/beam_search_v5/kernel.cpp    | 189 +++++++
 .../ipnsw/kernel/debug/kernel.cpp             |   2 +
 .../ipnsw/kernel/greedy_walk/kernel.cpp       | 147 +++++
 .../ipnsw/kernel/greedy_walk_v1/kernel.cpp    | 147 +++++
 .../ipnsw/kernel/greedy_walk_v2/kernel.cpp    | 147 +++++
 .../ipnsw/kernel/greedy_walk_v3/kernel.cpp    | 147 +++++
 .../ipnsw/kernel/include/heap.hpp             |  40 ++
 .../ipnsw/kernel/include/hello_world.hpp      |   6 +
 .../ipnsw/kernel/include/inner_product.hpp    |  89 +++
 .../ipnsw/kernel/include/set.hpp              |  73 +++
 .../ipnsw/kernel/iproduct_ubmk/kernel.cpp     |  71 +++
 .../ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp  |  76 +++
 .../ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp  |  76 +++
 .../ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp  |  76 +++
 .../ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp  |  76 +++
 43 files changed, 4051 insertions(+)
 create mode 100644 .gitmodules
 create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IO.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/Makefile
 create mode 100644 examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
 create mode 160000 examples/sdh-eval-workloads/ipnsw/hb-prog-eval
 create mode 100644 examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..5083eb4ea
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "n"]
+	path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval
+	url = git@github.com:bespoke-silicon-group/hb-prog-eval
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp
new file mode 100644
index 000000000..3d14b2c8d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp
@@ -0,0 +1,23 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "BeamSearchKernelRunner.hpp"
+#include "BeamSearchResultReader.hpp"
+
+namespace ipnsw {
+    // IPNSWFactory subclass that supplies the beam-search kernel runner and
+    // the beam-search result reader.
+    class BeamSearchFactory : public IPNSWFactory {
+    private:
+        // Returns a BeamSearchKernelRunner allocated with `new`.
+        // NOTE(review): ownership presumably passes to IPNSWFactory; confirm there.
+        IPNSWKernelRunner *_KernelRunner() const {
+            return new BeamSearchKernelRunner;
+        }
+
+        // Returns a BeamSearchResultReader allocated with `new`.
+        // NOTE(review): ownership presumably passes to IPNSWFactory; confirm there.
+        IPNSWResultReader *_ResultReader() const {
+            return new BeamSearchResultReader;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
new file mode 100644
index 000000000..426042f6d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
@@ -0,0 +1,56 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+#include "GreedyWalkResults.hpp"
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace ipnsw {
+    // Kernel runner for the beam search. Before building the argument list it
+    // seeds the device-side v_curr/d_curr slots with the GREEDY_WALK_RESULTS
+    // entry selected by IPNSWRunner::QUERY.
+    class BeamSearchKernelRunner : public IPNSWKernelRunner {
+        // Returns the kernel name string "ipnsw_beam_search".
+        std::string kernelName(const IPNSWRunner & runner) const {
+            return "ipnsw_beam_search";
+        }
+
+        // Writes the GWR_VERT and GWR_DIST fields of the selected greedy-walk
+        // result to the device, then returns the list of device addresses.
+        // Throws std::out_of_range when IPNSWRunner::QUERY is not a valid index
+        // into GREEDY_WALK_RESULTS.
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+            // One bounds-checked lookup instead of two unchecked operator[] calls.
+            const GreedyWalkResult & seed = GREEDY_WALK_RESULTS.at(IPNSWRunner::QUERY);
+
+            // Left non-const because their addresses are handed to hb->write().
+            int v_curr = std::get<GWR_VERT>(seed);
+            float d_curr = std::get<GWR_DIST>(seed);
+
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            hb->write(runner.v_curr_dev(), &v_curr, sizeof(v_curr));
+            hb->write(runner.d_curr_dev(), &d_curr, sizeof(d_curr));
+
+            // NOTE(review): the order presumably has to match the kernel's
+            // parameter list; confirm against the ipnsw_beam_search source.
+            std::vector<hb_mc_eva_t> argv = {
+                runner.graph_metadata_dev(),
+                runner.db_dev(),
+                runner.query_dev(),
+                runner.seen_dev(),
+                runner.v_curr_dev(),
+                runner.d_curr_dev(),
+                runner.candidates_dev(),
+                runner.results_dev(),
+                runner.n_results_dev(),
+            };
+            return argv;
+        }
+
+        // NOTE(review): gd/tgd presumably are the launch dimensions consumed by
+        // IPNSWKernelRunner; both are fixed at Dim(1,1). Confirm the meaning there.
+        Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);}
+        Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);}
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
new file mode 100644
index 000000000..ce77d324f
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
@@ -0,0 +1,49 @@
+#pragma once
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+#include "GreedyWalkResults.hpp"
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+namespace ipnsw {
+    // Result reader for the beam search: copies the results back from the
+    // device and prints them to std::cout, one "{a,b}" pair per line.
+    class BeamSearchResultReader : public IPNSWResultReader {
+    public:
+        // Reads the result count from runner.n_results_dev(), then that many
+        // GreedyWalkResult entries from runner.results_dev(), and prints them.
+        // Throws std::length_error if the device reports a negative count.
+        void readResults(const IPNSWRunner & runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+
+            int n_results = 0;
+            hb->read(runner.n_results_dev(), &n_results, sizeof(n_results));
+
+            // Reject a negative count explicitly instead of letting it wrap
+            // around to a huge unsigned vector size.
+            if (n_results < 0) {
+                throw std::length_error("BeamSearchResultReader: negative result count");
+            }
+
+            std::vector<GreedyWalkResult> results(static_cast<std::size_t>(n_results));
+
+            // Nothing to transfer for an empty result set; indexing element 0
+            // of an empty vector would be undefined behaviour.
+            if (!results.empty()) {
+                // NOTE(review): raw byte copy into GreedyWalkResult objects; this
+                // assumes the host layout matches what the kernel wrote - confirm.
+                hb->push_read(runner.results_dev(), results.data(),
+                              results.size() * sizeof(GreedyWalkResult));
+                hb->sync_read();
+            }
+
+            std::cout << "Beam search:" << std::endl;
+            for (const auto & r : results) {
+                std::cout << "{" << std::get<0>(r) << "," << std::get<1>(r) << "}" << std::endl;
+            }
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp
new file mode 100644
index 000000000..e98f11ad2
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp
@@ -0,0 +1,23 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "GreedyWalkKernelRunner.hpp"
+#include "GreedyWalkResultReader.hpp"
+
+namespace ipnsw {
+    // IPNSWFactory subclass that supplies the greedy-walk kernel runner and
+    // the greedy-walk result reader.
+    class GreedyWalkFactory : public IPNSWFactory {
+    private:
+        // Returns a GreedyWalkKernelRunner allocated with `new`.
+        // NOTE(review): ownership presumably passes to IPNSWFactory; confirm there.
+        IPNSWKernelRunner *_KernelRunner() const {
+            return new GreedyWalkKernelRunner;
+        }
+
+        // Returns a GreedyWalkResultReader allocated with `new`.
+        // NOTE(review): ownership presumably passes to IPNSWFactory; confirm there.
+        IPNSWResultReader *_ResultReader() const {
+            return new GreedyWalkResultReader;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
new file mode 100644
index 000000000..72eea9f0f
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
@@ -0,0 +1,37 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+
+namespace ipnsw {
+    // Kernel runner for the greedy walk: provides the kernel name, the list
+    // of device addresses passed as arguments, and two fixed Dim(1,1) values.
+    class GreedyWalkKernelRunner : public IPNSWKernelRunner {
+        // Returns the kernel name string "ipnsw_greedy_search".
+        std::string kernelName(const IPNSWRunner & runner) const {
+            return "ipnsw_greedy_search";
+        }
+
+        // Returns the device addresses obtained from `runner`, in this order.
+        // NOTE(review): the order presumably has to match the kernel's
+        // parameter list; confirm against the ipnsw_greedy_search source.
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+            return {
+                runner.graph_metadata_dev(),
+                runner.db_dev(),
+                runner.query_dev(),
+                runner.seen_dev(),
+                runner.v_curr_dev(),
+                runner.d_curr_dev(),
+            };
+        }
+
+        // NOTE(review): gd/tgd presumably are the launch dimensions consumed by
+        // IPNSWKernelRunner; both are fixed at Dim(1,1). Confirm the meaning there.
+        Dim gd(const IPNSWRunner &runner) const {
+            return Dim(1,1);
+        }
+        Dim tgd(const IPNSWRunner &runner) const {
+            return Dim(1,1);
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
new file mode 100644
index 000000000..ae57cd548
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
@@ -0,0 +1,26 @@
+#pragma once
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+
+namespace ipnsw {
+    // Result reader for the greedy walk: fetches v_curr and d_curr from the
+    // device and prints them to std::cout.
+    class GreedyWalkResultReader : public IPNSWResultReader {
+    public:
+        // Reads one int from runner.v_curr_dev() and one float from
+        // runner.d_curr_dev(), then prints them as "(v_curr,d_curr)".
+        void readResults(const IPNSWRunner & runner) {
+            HammerBlade::Ptr device = HammerBlade::Get();
+
+            int vertex;
+            device->read(runner.v_curr_dev(), &vertex, sizeof(vertex));
+
+            float distance;
+            device->read(runner.d_curr_dev(), &distance, sizeof(distance));
+
+            std::cout << "Greedy walk (v_curr,d_curr) = "
+                      << "(" << vertex << "," << distance << ")"
+                      << std::endl;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp
new file mode 100644
index 000000000..7d37104df
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp
@@ -0,0 +1,517 @@
+#include "GreedyWalkResults.hpp"
+namespace ipnsw {
+    std::vector<GreedyWalkResult> GREEDY_WALK_RESULTS = {
+	GreedyWalkResult(static_cast<float>(-0x1.94442e0000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.e72901fffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.cb85360000001p-4),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.e56d7ffffffffp-8), 78517),
+	GreedyWalkResult(static_cast<float>(-0x1.655f860000000p-4),732469),
+	GreedyWalkResult(static_cast<float>(-0x1.04cbcc0000000p-4),380912),
+	GreedyWalkResult(static_cast<float>(-0x1.3243d20000000p-5),606365),
+	GreedyWalkResult(static_cast<float>(-0x1.2dbf640000000p-4),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.fa90ea0000001p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.2922f80000000p-3),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.5974060000000p-1),725033),
+	GreedyWalkResult(static_cast<float>(-0x1.abcf2c0000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.b262380000000p-1),272753),
+	GreedyWalkResult(static_cast<float>(-0x1.c0e98a0000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.01b4680000000p-2),184077),
+	GreedyWalkResult(static_cast<float>(-0x1.96e3280000000p-2),208965),
+	GreedyWalkResult(static_cast<float>(-0x1.58dd120000000p-3),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.1f333a0000000p-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.8db7de0000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.4e43500000000p-2),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.a5ae760000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.7fcff00000000p-5),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.5630f40000000p-1),960530),
+	GreedyWalkResult(static_cast<float>(-0x1.48d8c20000000p-1),853984),
+	GreedyWalkResult(static_cast<float>(-0x1.14556ffffffffp+0),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.a746760000000p-2),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.ddcb81fffffffp-3),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.94a92ffffffffp-2),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.45b69c0000000p-1),432335),
+	GreedyWalkResult(static_cast<float>(-0x1.2ef8fa0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.9909440000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1ce937fffffffp-5),321516),
+	GreedyWalkResult(static_cast<float>(-0x1.c0de380000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.0de8e60000000p-8),897966),
+	GreedyWalkResult(static_cast<float>(-0x1.99783c0000000p-1),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.01316e0000000p+0),886263),
+	GreedyWalkResult(static_cast<float>(-0x1.a172140000000p-6),177485),
+	GreedyWalkResult(static_cast<float>(-0x1.2b8f9a0000000p-7),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.924b440000000p-5),290055),
+	GreedyWalkResult(static_cast<float>(-0x1.8515aa0000000p-2),905210),
+	GreedyWalkResult(static_cast<float>(-0x1.f68975ffffffep-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.dd5ed00000001p-6),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.be40740000000p-1),870888),
+	GreedyWalkResult(static_cast<float>(-0x1.08f4460000001p-2),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.2589100000000p-2),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.e43ad00000001p-3),230001),
+	GreedyWalkResult(static_cast<float>(-0x1.161b360000000p+0),646867),
+	GreedyWalkResult(static_cast<float>(-0x1.475e87fffffffp-6),179303),
+	GreedyWalkResult(static_cast<float>(-0x1.425b1c0000000p-1),463324),
+	GreedyWalkResult(static_cast<float>(-0x1.f4b68c0000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.1333440000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.0e35aa0000000p-1),312088),
+	GreedyWalkResult(static_cast<float>(-0x1.1b7653fffffffp+0),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.cb8adc0000000p-3),491377),
+	GreedyWalkResult(static_cast<float>(-0x1.51a0380000000p-1),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.e4b9940000000p-2),603696),
+	GreedyWalkResult(static_cast<float>(-0x1.623f9a0000000p-2),991097),
+	GreedyWalkResult(static_cast<float>(-0x1.1660b20000000p-1), 18868),
+	GreedyWalkResult(static_cast<float>(-0x1.bd75200000000p-7), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.4dbbe00000000p+0), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.1b55860000000p-5),310512),
+	GreedyWalkResult(static_cast<float>(-0x1.1f40e00000000p+0),115894),
+	GreedyWalkResult(static_cast<float>(-0x1.d403c60000001p-2),718485),
+	GreedyWalkResult(static_cast<float>(-0x1.a7b7bdfffffffp-7),601673),
+	GreedyWalkResult(static_cast<float>(-0x1.7f5c8c0000000p-2),552153),
+	GreedyWalkResult(static_cast<float>(-0x1.6834060000001p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.8ccf620000000p-2),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.1508660000000p+0),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.6362300000000p-1),982683),
+	GreedyWalkResult(static_cast<float>(-0x1.175fbc0000000p-4),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.10e30a0000000p-5),703851),
+	GreedyWalkResult(static_cast<float>(-0x1.0343340000000p+0),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.9337a20000000p-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.986e8a0000000p-7),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.1f400a0000000p+0),336830),
+	GreedyWalkResult(static_cast<float>(-0x1.3c0e060000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.8589cc0000000p-1),118607),
+	GreedyWalkResult(static_cast<float>(-0x1.745f000000000p-3),272753),
+	GreedyWalkResult(static_cast<float>(-0x1.317ca40000000p-4),494402),
+	GreedyWalkResult(static_cast<float>(-0x1.ebd52a0000001p-7),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.7ad9100000001p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.6ed8a00000000p-2),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.273edc0000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.93db8c0000000p-1),620143),
+	GreedyWalkResult(static_cast<float>(-0x1.324dd60000000p-4),778172),
+	GreedyWalkResult(static_cast<float>(-0x1.3c59a80000000p-1),270175),
+	GreedyWalkResult(static_cast<float>(-0x1.fc51e80000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.a7fbc60000000p-2),603696),
+	GreedyWalkResult(static_cast<float>(-0x1.ab76780000000p-1),406402),
+	GreedyWalkResult(static_cast<float>(-0x1.8733320000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.447bb00000000p-1),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.b5c3140000000p-5),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.ca9b880000000p-1),785859),
+	GreedyWalkResult(static_cast<float>(-0x1.beee640000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.47b4e80000000p-1),738101),
+	GreedyWalkResult(static_cast<float>(-0x1.069a7c0000000p-1),193430),
+	GreedyWalkResult(static_cast<float>(-0x1.20f53c0000000p-1),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.1612f80000000p-2),711979),
+	GreedyWalkResult(static_cast<float>(-0x1.25c6c80000000p-1),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.2507300000000p-2), 36731),
+	GreedyWalkResult(static_cast<float>(-0x1.14ef720000001p+0),268974),
+	GreedyWalkResult(static_cast<float>(-0x1.2b54f80000000p-4), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.e07ccbfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.070d960000000p-4),785239),
+	GreedyWalkResult(static_cast<float>(-0x1.49e6200000000p-1),496330),
+	GreedyWalkResult(static_cast<float>(-0x1.86c9080000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.0b584c0000000p-1),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.bb1ee00000000p-7),439426),
+	GreedyWalkResult(static_cast<float>(-0x1.ff17c9fffffffp-11),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.0da6980000000p+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1d0ba40000001p-3),288912),
+	GreedyWalkResult(static_cast<float>(-0x1.301dec0000000p-1),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.2f9b800000000p-4),261103),
+	GreedyWalkResult(static_cast<float>(-0x1.8d769e0000000p-4),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.6ea4f80000000p-3),223977),
+	GreedyWalkResult(static_cast<float>(-0x1.fcc7dc0000000p-2),662137),
+	GreedyWalkResult(static_cast<float>(-0x1.5949fe0000000p-3),565830),
+	GreedyWalkResult(static_cast<float>(-0x1.1a11aa0000000p-1),908217),
+	GreedyWalkResult(static_cast<float>(-0x1.8bff140000000p-1),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.7ccda1fffffffp-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.80bf6e0000000p-2), 50016),
+	GreedyWalkResult(static_cast<float>(-0x1.3444300000000p-2),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.c8e8bc0000000p-1),223249),
+	GreedyWalkResult(static_cast<float>(-0x1.679767fffffffp-3),494887),
+	GreedyWalkResult(static_cast<float>(-0x1.6c896c0000000p-3),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.413b740000000p-4),772422),
+	GreedyWalkResult(static_cast<float>(-0x1.4e1d760000000p-3),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.7202fe0000001p-1),131611),
+	GreedyWalkResult(static_cast<float>(-0x1.2589840000000p+0),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.5820da0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.96ceb00000001p-3),177485),
+	GreedyWalkResult(static_cast<float>(-0x1.d6ac77fffffffp-4),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.bfefa00000000p-7),149329),
+	GreedyWalkResult(static_cast<float>(-0x1.69ac280000000p-1), 73867),
+	GreedyWalkResult(static_cast<float>(-0x1.04bb900000000p+0),567514),
+	GreedyWalkResult(static_cast<float>(-0x1.142a3dfffffffp+0),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.2f1ca40000000p-5),552153),
+	GreedyWalkResult(static_cast<float>(-0x1.1def580000000p-1),679881),
+	GreedyWalkResult(static_cast<float>(-0x1.072ac60000000p-4), 29163),
+	GreedyWalkResult(static_cast<float>(-0x1.2821940000000p-4),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.72a68e0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.cafce80000000p-3),729852),
+	GreedyWalkResult(static_cast<float>(-0x1.3ba2d80000000p-2),729021),
+	GreedyWalkResult(static_cast<float>(-0x1.68739e0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.aeb25c0000000p-1),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.18c0840000000p-5),693842),
+	GreedyWalkResult(static_cast<float>(-0x1.fe21ce0000001p-1), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.b41fb00000001p-1),735181),
+	GreedyWalkResult(static_cast<float>(-0x1.2826320000000p-8),379502),
+	GreedyWalkResult(static_cast<float>(-0x1.5eecda0000000p-1),925333),
+	GreedyWalkResult(static_cast<float>(-0x1.b002d40000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.4e53aa0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.a1b49bfffffffp-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.f1c7ac0000000p-1),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.67f6720000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.31a6600000001p-6),341861),
+	GreedyWalkResult(static_cast<float>(-0x1.61c1080000000p-3),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.aaa3780000000p-2),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.3fa68a0000001p-6),160291),
+	GreedyWalkResult(static_cast<float>(-0x1.38c0b20000000p-1),379199),
+	GreedyWalkResult(static_cast<float>(-0x1.ee68980000001p-2),318485),
+	GreedyWalkResult(static_cast<float>(-0x1.dd852c0000001p-2),655315),
+	GreedyWalkResult(static_cast<float>(-0x1.06fa43fffffffp+0),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.07007e0000000p+0),926790),
+	GreedyWalkResult(static_cast<float>(-0x1.f352a1fffffffp-1),523435),
+	GreedyWalkResult(static_cast<float>(-0x1.c6d6160000000p-1),169991),
+	GreedyWalkResult(static_cast<float>(-0x1.090c620000000p-5),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.19f6860000000p+0),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.e3f8580000001p-2),255916),
+	GreedyWalkResult(static_cast<float>(-0x1.2148180000000p-1),206826),
+	GreedyWalkResult(static_cast<float>(-0x1.0487660000000p-2),494402),
+	GreedyWalkResult(static_cast<float>(-0x1.be5ea00000000p-3),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.114b0a0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1e0a2a0000000p-7),379350),
+	GreedyWalkResult(static_cast<float>(-0x1.22f06bfffffffp+0),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.bc42c20000000p-1),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.9ec387fffffffp-2),495101),
+	GreedyWalkResult(static_cast<float>(-0x1.ab66b80000000p-3),115894),
+	GreedyWalkResult(static_cast<float>(-0x1.9be6e80000000p-4),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.4cdc7ffffffffp-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.c7a31c0000000p-7),764589),
+	GreedyWalkResult(static_cast<float>(-0x1.a35f1c0000000p-8),115043),
+	GreedyWalkResult(static_cast<float>(-0x1.3422a00000000p-1),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.5a4aa60000000p-4), 49557),
+	GreedyWalkResult(static_cast<float>(-0x1.06eddc0000000p-2),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.d46bde0000000p-1),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.02e72c0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.e33abffffffffp-2),112248),
+	GreedyWalkResult(static_cast<float>(-0x1.ae74060000001p-4),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.272a2bfffffffp-7),850826),
+	GreedyWalkResult(static_cast<float>(-0x1.357f25fffffffp-2),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.33c9f1fffffffp-3), 25893),
+	GreedyWalkResult(static_cast<float>(-0x1.771fdc0000001p-5),305162),
+	GreedyWalkResult(static_cast<float>(-0x1.18a1080000000p-4),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.46ad1e0000000p-4),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.0a53300000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.783f4e0000000p-6),546811),
+	GreedyWalkResult(static_cast<float>(-0x1.3f05b60000000p-3),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.602d5c0000000p-3),463324),
+	GreedyWalkResult(static_cast<float>(-0x1.c8f2b20000000p-5),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.0bde920000000p+0),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.8eb3fe0000000p-1),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.d981120000002p-3),849285),
+	GreedyWalkResult(static_cast<float>(-0x1.d8151a0000001p-1),133288),
+	GreedyWalkResult(static_cast<float>(-0x1.c231ec0000000p-1),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.c742700000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.2a6c6a0000000p+0),945767),
+	GreedyWalkResult(static_cast<float>(-0x1.5b8c5bfffffffp-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.391a700000000p-12),562015),
+	GreedyWalkResult(static_cast<float>(-0x1.896b960000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.28e7fe0000000p-3),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.577a11fffffffp-4),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.43b7f80000000p-4),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.7e64600000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.97ebe20000000p-5),392823),
+	GreedyWalkResult(static_cast<float>(-0x1.a856440000000p-3),793084),
+	GreedyWalkResult(static_cast<float>(-0x1.84531a0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.7c80d40000000p-4),186838),
+	GreedyWalkResult(static_cast<float>(-0x1.0c56e5fffffffp+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.72c0da0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1844d00000000p-5),606365),
+	GreedyWalkResult(static_cast<float>(-0x1.52a5d40000000p-10),470059),
+	GreedyWalkResult(static_cast<float>(-0x1.7d31400000000p-1),738101),
+	GreedyWalkResult(static_cast<float>(-0x1.c47df00000000p-7),710471),
+	GreedyWalkResult(static_cast<float>(-0x1.dc3ccbfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.5e773c0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.7ffd660000000p-2),920345),
+	GreedyWalkResult(static_cast<float>(-0x1.ab0dc00000001p-2),677155),
+	GreedyWalkResult(static_cast<float>(-0x1.7f8db00000000p-5),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.add3b60000000p-1),293302),
+	GreedyWalkResult(static_cast<float>(-0x1.e0328c0000000p-4),758625),
+	GreedyWalkResult(static_cast<float>(-0x1.6022ce0000000p-5),666073),
+	GreedyWalkResult(static_cast<float>(-0x1.a1d241fffffffp-4),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.cec5e60000000p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.893f260000000p-3),855760),
+	GreedyWalkResult(static_cast<float>(-0x1.0790c00000000p-2),145893),
+	GreedyWalkResult(static_cast<float>(-0x1.49456ffffffffp-7),215955),
+	GreedyWalkResult(static_cast<float>(-0x1.71b1bc0000001p-5),312088),
+	GreedyWalkResult(static_cast<float>(-0x1.8b1c580000000p-1),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.2010d20000000p-4),142436),
+	GreedyWalkResult(static_cast<float>(-0x1.c33ecc0000000p-4),280878),
+	GreedyWalkResult(static_cast<float>(-0x1.6b1dce0000000p-2),444780),
+	GreedyWalkResult(static_cast<float>(-0x1.f76bb60000001p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.87151ffffffffp-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.f522ae0000000p-2),  9333),
+	GreedyWalkResult(static_cast<float>(-0x1.77d5c40000001p-4),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.f7f4edfffffffp-5),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.1c46b00000000p-1),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.a4f43bfffffffp-6),140906),
+	GreedyWalkResult(static_cast<float>(-0x1.8952480000000p-1),670146),
+	GreedyWalkResult(static_cast<float>(-0x1.ca891c0000000p-7),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.e36b85fffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1aaf580000000p-3),909372),
+	GreedyWalkResult(static_cast<float>(-0x1.8116920000000p-8), 51434),
+	GreedyWalkResult(static_cast<float>(-0x1.acc07e0000000p-1), 26012),
+	GreedyWalkResult(static_cast<float>(-0x1.a2316c0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.3a68660000000p-3),628152),
+	GreedyWalkResult(static_cast<float>(-0x1.c199e80000000p-2),907223),
+	GreedyWalkResult(static_cast<float>(-0x1.8bfc920000000p-3), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.c9b8520000000p-5),568921),
+	GreedyWalkResult(static_cast<float>(-0x1.be82e20000000p-2),134880),
+	GreedyWalkResult(static_cast<float>(-0x1.8cabe60000001p-2),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.7222980000000p-1),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.b313ea0000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.8b56380000000p-7), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.3e74440000000p-3),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.6349900000000p-9),136557),
+	GreedyWalkResult(static_cast<float>(-0x1.2128060000001p+0),672634),
+	GreedyWalkResult(static_cast<float>(-0x1.25d0560000001p-8),314066),
+	GreedyWalkResult(static_cast<float>(-0x1.206c1a0000000p+0),288181),
+	GreedyWalkResult(static_cast<float>(-0x1.696a200000001p-3),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.1a74180000000p-1),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.608e8a0000000p-2),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.e583780000001p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.cdfae5fffffffp-1),288181),
+	GreedyWalkResult(static_cast<float>(-0x1.53c3200000001p-5),926790),
+	GreedyWalkResult(static_cast<float>(-0x1.a8f37bfffffffp-5),164698),
+	GreedyWalkResult(static_cast<float>(-0x1.e1399ffffffffp-7),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.adf8240000000p-3),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.f91ca60000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.b717880000000p-3), 70417),
+	GreedyWalkResult(static_cast<float>(-0x1.57b0760000000p-4),939764),
+	GreedyWalkResult(static_cast<float>(-0x1.1de1ca0000000p+0), 74899),
+	GreedyWalkResult(static_cast<float>(-0x1.c67da40000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.64c96c0000001p-2),261103),
+	GreedyWalkResult(static_cast<float>(-0x1.54c6240000000p-1),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.0274f60000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.5b05140000000p-1),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.1a4ca80000000p-6),950108),
+	GreedyWalkResult(static_cast<float>(-0x1.de24900000000p-1),836318),
+	GreedyWalkResult(static_cast<float>(-0x1.5c834e0000000p-3),228059),
+	GreedyWalkResult(static_cast<float>(-0x1.682d5c0000000p-3),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.b96de80000000p-1),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.f1c5680000000p-1),186838),
+	GreedyWalkResult(static_cast<float>(-0x1.d87015fffffffp-3),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.992d1ffffffffp-2),884850),
+	GreedyWalkResult(static_cast<float>(-0x1.38d1580000001p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.a59a700000001p-3),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.bb07fe0000001p-5),531816),
+	GreedyWalkResult(static_cast<float>(-0x1.48fa060000000p-1),128603),
+	GreedyWalkResult(static_cast<float>(-0x1.81b2000000001p-7),129055),
+	GreedyWalkResult(static_cast<float>(-0x1.4bfc5bfffffffp-2),576030),
+	GreedyWalkResult(static_cast<float>(-0x1.4683200000000p-1),727476),
+	GreedyWalkResult(static_cast<float>(-0x1.9165800000000p-5), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.2b59be0000000p-3),941181),
+	GreedyWalkResult(static_cast<float>(-0x1.21086e0000000p-5),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.1fb5700000000p-7),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.3fa0620000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.d2b8bdfffffffp-2),355312),
+	GreedyWalkResult(static_cast<float>(-0x1.ec8a43fffffffp-2),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.eeaace0000000p-9),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.5649140000000p-1),842476),
+	GreedyWalkResult(static_cast<float>(-0x1.49e3ae0000001p-6), 29163),
+	GreedyWalkResult(static_cast<float>(-0x1.b53db20000001p-5),442413),
+	GreedyWalkResult(static_cast<float>(-0x1.5aa6380000000p-3),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.cdc0f80000000p-3),450479),
+	GreedyWalkResult(static_cast<float>(-0x1.c9aab80000000p-2),541408),
+	GreedyWalkResult(static_cast<float>(-0x1.0d78740000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.1a48820000000p-6),810043),
+	GreedyWalkResult(static_cast<float>(-0x1.3a76fc0000000p-1),804725),
+	GreedyWalkResult(static_cast<float>(-0x1.2f318a0000000p-7),562579),
+	GreedyWalkResult(static_cast<float>(-0x1.6c91920000000p-2),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.9ac5940000000p-4),263560),
+	GreedyWalkResult(static_cast<float>(-0x1.42bc8c0000000p-1),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.906b7c0000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.3586ac0000000p-7), 53791),
+	GreedyWalkResult(static_cast<float>(-0x1.69ef5a0000000p-3),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.4e4f3e0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.b379440000000p-1),980037),
+	GreedyWalkResult(static_cast<float>(-0x1.1a94380000000p+0),624004),
+	GreedyWalkResult(static_cast<float>(-0x1.5e22e00000001p-8), 36331),
+	GreedyWalkResult(static_cast<float>(-0x1.919a7c0000000p-1),883883),
+	GreedyWalkResult(static_cast<float>(-0x1.0313ea0000000p+0),117555),
+	GreedyWalkResult(static_cast<float>(-0x1.8781320000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.8504900000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.2e79740000000p-2),827608),
+	GreedyWalkResult(static_cast<float>(-0x1.91ac000000000p-5),355549),
+	GreedyWalkResult(static_cast<float>(-0x1.e0b6b80000000p-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.ae8bd00000000p-1), 26012),
+	GreedyWalkResult(static_cast<float>(-0x1.edd4cc0000001p-5),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.1191160000000p-6),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.3c69140000000p-2),192244),
+	GreedyWalkResult(static_cast<float>(-0x1.30a7540000000p+0),804725),
+	GreedyWalkResult(static_cast<float>(-0x1.77bda40000002p-5),654035),
+	GreedyWalkResult(static_cast<float>(-0x1.f0496e0000001p-1),  2251),
+	GreedyWalkResult(static_cast<float>(-0x1.788009fffffffp-4),439426),
+	GreedyWalkResult(static_cast<float>(-0x1.3527f9fffffffp+0),354262),
+	GreedyWalkResult(static_cast<float>(-0x1.1914b20000000p+0), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.4b03460000000p-4),648421),
+	GreedyWalkResult(static_cast<float>(-0x1.25ae300000000p-1),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.cd467c0000000p-6), 47898),
+	GreedyWalkResult(static_cast<float>(-0x1.e082960000001p-3),169790),
+	GreedyWalkResult(static_cast<float>(-0x1.38970e0000000p-5),495101),
+	GreedyWalkResult(static_cast<float>(-0x1.d88693fffffffp-2),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.13046c0000000p-1),439129),
+	GreedyWalkResult(static_cast<float>(-0x1.ed2e720000001p-4),749981),
+	GreedyWalkResult(static_cast<float>(-0x1.b162180000000p-5),864388),
+	GreedyWalkResult(static_cast<float>(-0x1.458a1a0000000p-2),121683),
+	GreedyWalkResult(static_cast<float>(-0x1.ffddf40000000p-6), 82234),
+	GreedyWalkResult(static_cast<float>(-0x1.c99b320000001p-6),495323),
+	GreedyWalkResult(static_cast<float>(-0x1.aa13de0000000p-3),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.36671e0000000p-4),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.276aaa0000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.41718e0000000p-6),973080),
+	GreedyWalkResult(static_cast<float>(-0x1.39280c0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.8156020000000p-6),854497),
+	GreedyWalkResult(static_cast<float>(-0x1.075a840000000p+0),930775),
+	GreedyWalkResult(static_cast<float>(-0x1.0b01560000000p-1), 52041),
+	GreedyWalkResult(static_cast<float>(-0x1.fabeec0000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.794f3a0000000p-1),384841),
+	GreedyWalkResult(static_cast<float>(-0x1.d9d54dfffffffp-1),419057),
+	GreedyWalkResult(static_cast<float>(-0x1.c27da80000000p-2),219992),
+	GreedyWalkResult(static_cast<float>(-0x1.0d06660000000p-5),563395),
+	GreedyWalkResult(static_cast<float>(-0x1.7ee86e0000000p-1),348136),
+	GreedyWalkResult(static_cast<float>(-0x1.a219b9fffffffp-3),969505),
+	GreedyWalkResult(static_cast<float>(-0x1.434a760000000p-4), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.6cf4380000000p-1),677921),
+	GreedyWalkResult(static_cast<float>(-0x1.94c9c00000000p-6),901398),
+	GreedyWalkResult(static_cast<float>(-0x1.c625540000000p-5),932100),
+	GreedyWalkResult(static_cast<float>(-0x1.2309d40000000p-1),677155),
+	GreedyWalkResult(static_cast<float>(-0x1.3719a60000000p-4),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.2c1eba0000000p-6),527498),
+	GreedyWalkResult(static_cast<float>(-0x1.affd100000000p-1),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.09db9c0000000p-2),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.b991e00000000p-4),535044),
+	GreedyWalkResult(static_cast<float>(-0x1.2c3aec0000000p-8),938124),
+	GreedyWalkResult(static_cast<float>(-0x1.cce0d20000000p-1),496356),
+	GreedyWalkResult(static_cast<float>(-0x1.d80a4a0000000p-8),776790),
+	GreedyWalkResult(static_cast<float>(-0x1.b3f6ec0000000p-1),749772),
+	GreedyWalkResult(static_cast<float>(-0x1.d370f60000000p-1),441230),
+	GreedyWalkResult(static_cast<float>(-0x1.17859e0000000p+0), 12009),
+	GreedyWalkResult(static_cast<float>(-0x1.552dde0000000p-2),228514),
+	GreedyWalkResult(static_cast<float>(-0x1.1e56f40000000p+0), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.5b74140000000p-4),186084),
+	GreedyWalkResult(static_cast<float>(-0x1.2bc8580000000p+0),870888),
+	GreedyWalkResult(static_cast<float>(-0x1.03ba840000000p+0),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.9e8ea80000000p-2),114191),
+	GreedyWalkResult(static_cast<float>(-0x1.9181880000000p-6),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.fd3e6a0000000p-3),255668),
+	GreedyWalkResult(static_cast<float>(-0x1.d793e5fffffffp-6),511753),
+	GreedyWalkResult(static_cast<float>(-0x1.335bf00000000p-6),679881),
+	GreedyWalkResult(static_cast<float>(-0x1.98bd340000000p-1), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.37253c0000000p-3),337863),
+	GreedyWalkResult(static_cast<float>(-0x1.55a79e0000000p-2),270226),
+	GreedyWalkResult(static_cast<float>(-0x1.f2ead00000001p-3),430269),
+	GreedyWalkResult(static_cast<float>(-0x1.f45e060000002p-3),226356),
+	GreedyWalkResult(static_cast<float>(-0x1.c435d60000001p-9), 81654),
+	GreedyWalkResult(static_cast<float>(-0x1.1ea9580000000p+0),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.cc1a520000000p-2),444956),
+	GreedyWalkResult(static_cast<float>(-0x1.9428000000000p-2),914163),
+	GreedyWalkResult(static_cast<float>(-0x1.8f2a440000000p-2), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.077cdc0000000p+0),582680),
+	GreedyWalkResult(static_cast<float>(-0x1.31819c0000000p-3),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.5ae2840000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.0f86240000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.e4b8040000000p-2),  5217),
+	GreedyWalkResult(static_cast<float>(-0x1.92a3020000000p-6),866106),
+	GreedyWalkResult(static_cast<float>(-0x1.4c2bd40000000p-3),560074),
+	GreedyWalkResult(static_cast<float>(-0x1.96bfae0000000p-2),225945),
+	GreedyWalkResult(static_cast<float>(-0x1.7cfb9a0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.809e320000001p-5),890893),
+	GreedyWalkResult(static_cast<float>(-0x1.1156de0000000p-1),313671),
+	GreedyWalkResult(static_cast<float>(-0x1.eb64960000000p-1), 23136),
+	GreedyWalkResult(static_cast<float>(-0x1.5a97fa0000000p-2),228059),
+	GreedyWalkResult(static_cast<float>(-0x1.2f87c20000001p-1),945767),
+	GreedyWalkResult(static_cast<float>(-0x1.45a1460000000p-4), 29348),
+	GreedyWalkResult(static_cast<float>(-0x1.ddef220000001p-3),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.0b9e120000000p-5),179074),
+	GreedyWalkResult(static_cast<float>(-0x1.f977160000001p-4),141149),
+	GreedyWalkResult(static_cast<float>(-0x1.b366bc0000000p-1),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.7009520000000p-2),467026),
+	GreedyWalkResult(static_cast<float>(-0x1.08adbe0000000p-3),550091),
+	GreedyWalkResult(static_cast<float>(-0x1.c989580000000p-4),168533),
+	GreedyWalkResult(static_cast<float>(-0x1.56433a0000000p-1),672634),
+	GreedyWalkResult(static_cast<float>(-0x1.dbe0b00000000p-5),667763),
+	GreedyWalkResult(static_cast<float>(-0x1.11c0620000000p+0),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.d6d5560000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.899de00000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.fc2835fffffffp-4),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.28141e0000000p-6),986292),
+	GreedyWalkResult(static_cast<float>(-0x1.abb32c0000000p-1),134340),
+	GreedyWalkResult(static_cast<float>(-0x1.2c2b640000001p-3),926855),
+	GreedyWalkResult(static_cast<float>(-0x1.3447780000000p-3), 47688),
+	GreedyWalkResult(static_cast<float>(-0x1.5fb8300000000p-6),226268),
+	GreedyWalkResult(static_cast<float>(-0x1.73cba7fffffffp-4), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.b99f040000000p-1), 16476),
+	GreedyWalkResult(static_cast<float>(-0x1.6b9ba60000000p-1),112754),
+	GreedyWalkResult(static_cast<float>(-0x1.d3aa360000000p-1),192244),
+	GreedyWalkResult(static_cast<float>(-0x1.25282a0000000p+0),275023),
+	GreedyWalkResult(static_cast<float>(-0x1.16c09a0000000p-5), 56131),
+	GreedyWalkResult(static_cast<float>(-0x1.bdd6720000000p-3),667763),
+	GreedyWalkResult(static_cast<float>(-0x1.7421400000000p-1),587902),
+	GreedyWalkResult(static_cast<float>(-0x1.dfa079fffffffp-9),630231),
+	GreedyWalkResult(static_cast<float>(-0x1.debb760000001p-2),778627),
+	GreedyWalkResult(static_cast<float>(-0x1.3589be0000000p-4),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a659d00000000p-3),353498),
+	GreedyWalkResult(static_cast<float>(-0x1.9f913bfffffffp-4),936836),
+	GreedyWalkResult(static_cast<float>(-0x1.3b78740000000p-3),504419),
+	GreedyWalkResult(static_cast<float>(-0x1.42611c0000000p-3),107308),
+	GreedyWalkResult(static_cast<float>(-0x1.4e66860000000p-6),439809),
+	GreedyWalkResult(static_cast<float>(-0x1.4a79000000000p-1),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.41902a0000000p+0),774981),
+	GreedyWalkResult(static_cast<float>(-0x1.4850a60000000p-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a7bf000000000p-1),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.9d67d60000001p-5),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.c908860000000p-2),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.63e9520000000p-2),513240),
+	GreedyWalkResult(static_cast<float>(-0x1.e423200000000p-5),295526),
+	GreedyWalkResult(static_cast<float>(-0x1.91894ffffffffp-2),476414),
+	GreedyWalkResult(static_cast<float>(-0x1.29ba4a0000000p-4),774219),
+	GreedyWalkResult(static_cast<float>(-0x1.a577500000000p-1),582680),
+	GreedyWalkResult(static_cast<float>(-0x1.de39c80000000p-2),909721),
+	GreedyWalkResult(static_cast<float>(-0x1.f75ad40000001p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.93794a0000000p-1),750819),
+	GreedyWalkResult(static_cast<float>(-0x1.5f65ec0000000p-3),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.23f7820000000p-6),786537),
+	GreedyWalkResult(static_cast<float>(-0x1.a4f01e0000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.218c620000000p-12),134340),
+	GreedyWalkResult(static_cast<float>(-0x1.33a59e0000000p-1), 40323),
+	GreedyWalkResult(static_cast<float>(-0x1.c9920c0000000p-2),523435),
+	GreedyWalkResult(static_cast<float>(-0x1.18be840000000p-2),865184),
+	GreedyWalkResult(static_cast<float>(-0x1.0442d60000000p-1),729175),
+	GreedyWalkResult(static_cast<float>(-0x1.047e940000000p+0),255668),
+	GreedyWalkResult(static_cast<float>(-0x1.0d97ac0000000p-1),239334),
+	GreedyWalkResult(static_cast<float>(-0x1.2a5e4e0000001p-4),660609),
+	GreedyWalkResult(static_cast<float>(-0x1.f4887bfffffffp-1),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.a8d50c0000001p-2),531816),
+	GreedyWalkResult(static_cast<float>(-0x1.8e5e300000001p-4),541780),
+	GreedyWalkResult(static_cast<float>(-0x1.06e1a40000000p-2),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.2e98940000000p-5),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.1d7bb60000000p-4),320041),
+	GreedyWalkResult(static_cast<float>(-0x1.93514a0000000p-6), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.fe2429fffffffp-2),292300),
+	GreedyWalkResult(static_cast<float>(-0x1.161f500000000p-6), 38538),
+	GreedyWalkResult(static_cast<float>(-0x1.3d90900000000p-6),318039),
+	GreedyWalkResult(static_cast<float>(-0x1.01c5040000000p-2),532480),
+	GreedyWalkResult(static_cast<float>(-0x1.4f30960000000p-4),223261),
+	GreedyWalkResult(static_cast<float>(-0x1.8a9b3c0000000p-4),382537),
+	GreedyWalkResult(static_cast<float>(-0x1.02d07a0000000p-4),790506),
+	GreedyWalkResult(static_cast<float>(-0x1.9527260000001p-2),294738),
+	GreedyWalkResult(static_cast<float>(-0x1.047eea0000000p-1),886263),
+	GreedyWalkResult(static_cast<float>(-0x1.d0deba0000000p-1),278930),
+	GreedyWalkResult(static_cast<float>(-0x1.5c2d320000000p-1),236872),
+	GreedyWalkResult(static_cast<float>(-0x1.f1670a0000000p-8),580161),
+	GreedyWalkResult(static_cast<float>(-0x1.1426ce0000000p-3),550771),
+	GreedyWalkResult(static_cast<float>(-0x1.b5f0ee0000000p-5),517512),
+	GreedyWalkResult(static_cast<float>(-0x1.efd5180000000p-6),696486),
+	GreedyWalkResult(static_cast<float>(-0x1.f1b0440000000p-6),118809),
+	GreedyWalkResult(static_cast<float>(-0x1.28d45c0000000p-1),854962),
+	GreedyWalkResult(static_cast<float>(-0x1.f18c5e0000000p-1),184077),
+	GreedyWalkResult(static_cast<float>(-0x1.50e1320000000p-1),385014),
+	GreedyWalkResult(static_cast<float>(-0x1.fb43600000000p-2),467026),
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp
new file mode 100644
index 000000000..ec4a799d7
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#include <tuple>
+#include <vector>
+namespace ipnsw {
+    using GreedyWalkResult = std::pair<float, int>; // one table entry; judging by the constant names below: (distance, vertex) -- confirm against readers
+    extern std::vector<GreedyWalkResult> GREEDY_WALK_RESULTS; // declaration only (extern); the table is defined elsewhere
+    static constexpr int GWR_DIST = 0; // presumably the std::get<> index of the float member -- verify at use sites
+    static constexpr int GWR_VERT = 1; // presumably the std::get<> index of the int member -- verify at use sites
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp
new file mode 100644
index 000000000..7dd4ef05e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp
@@ -0,0 +1,223 @@
+#pragma once
+#include <vector>
+#include <string>
+#include <sstream>
+#include <Graph.hpp>
+#include <Graph500Data.hpp>
+#include <StringHelpers.hpp>
+#include <sstream>
+#include <map>
+
+namespace ipnsw {
+    //using graph_tools::Graph;
+    //using graph_tools::Graph500Data;
+
+    // Splits a command line into positional arguments and "--" options.
+    //
+    // An argument for which ipnsw::startswith(arg, "--") holds is an option name and
+    // consumes the next argument as its value. Every other argument is positional,
+    // assigned in order to: exe, ucode, version, data, queries, then up to four
+    // graph paths. Further positional arguments are silently ignored.
+    class Parser {
+    public:
+        using OptionTable = std::map<std::string, std::string>;
+
+        Parser() = default;
+
+        // Parses argv[0..argc) into the positional fields and the option table.
+        // A repeated option keeps the last value given.
+        // Throws std::runtime_error when an option is the last argument (no value).
+        void parse(int argc, char *argv[]) {
+            int pos = 0;
+            int arg = 0;
+
+            while (arg < argc) {
+                std::string argstr = std::string(argv[arg]);
+                if (ipnsw::startswith(argstr, "--")) {
+                    // optional argument
+                    if (++arg >= argc) {
+                        throw std::runtime_error("'" + argstr + "' requires an argument");
+                    }
+                    _options[argstr] = std::string(argv[arg]);
+
+                } else {
+                    // positional argument
+                    switch (pos++) {
+                    case 0: _exe     = argstr; break;
+                    case 1: _ucode   = argstr; break;
+                    case 2: _version = argstr; break;
+                    case 3: _data    = argstr; break;
+                    case 4: _queries = argstr; break;
+
+                    case 5:
+                    case 6:
+                    case 7:
+                    case 8:
+                        _graphs.push_back(argstr);
+                        break;
+
+                    default:
+                        // more positional arguments than expected: ignored
+                        break;
+                    }
+                }
+                arg++;
+            }
+
+            // Earlier fixed-position parsing (disabled):
+            // _exe = std::string(argv[0]);
+            // _ucode = std::string(argv[1]);
+            // _version = std::string(argv[2]);
+            // _data = std::string(argv[3]);
+            // _queries = std::string(argv[4]);
+            // // graphs
+            // for (int i = 5; i < argc; ++i) {
+            //     _graphs.push_back(std::string(argv[i]));
+            // }
+        }
+
+        // Returns a multi-line, human-readable dump of the positional arguments.
+        std::string str() const {
+            std::stringstream ss;
+            ss << "ucode: " << _ucode << "\n"
+               << "version: " << _version << "\n"
+               << "exe: " << _exe << "\n"
+               << "data: " << _data << "\n"
+               << "queries: " << _queries << "\n";
+
+            for (size_t i = 0; i < _graphs.size(); ++i) {
+                ss << "graph " << i << ": " << _graphs[i] << "\n";
+            }
+
+            return ss.str();
+        }
+
+        // Returns the value given for option `opt` (e.g. "--queries"), or "" if absent.
+        std::string option(const std::string &opt) const {
+            auto it = _options.find(opt);
+            if (it != _options.end())
+                return it->second;
+
+            return "";
+        }
+
+        // Returns the query indices listed in "--queries" (comma- and/or
+        // whitespace-separated integers); empty if the option is absent.
+        // Parsing stops at the first token that is not an integer.
+        std::vector<int> do_queries() const {
+            std::string do_queries_str = option("--queries");
+            if (do_queries_str.empty()) {
+                return {};
+            }
+
+            // Turn commas into spaces so stream extraction can tokenize the list.
+            for (char &c : do_queries_str) {
+                if (c == ',')
+                    c = ' ';
+            }
+
+            std::vector<int> _do_queries;
+            std::stringstream ss(do_queries_str);
+            // Test the extraction itself: looping on ss.good() appended a bogus
+            // entry after a trailing separator or a non-numeric token.
+            for (int q; ss >> q; ) {
+                _do_queries.push_back(q);
+            }
+
+            return _do_queries;
+        }
+
+        // Returns the integer value of "--num-iproducts", or 100 if not given.
+        int num_iproducts() const {
+            auto s = option("--num-iproducts");
+            return s.empty() ? 100 : from_string<int>(s);
+        }
+
+        // Accessors; each returns a copy of the corresponding parsed field.
+        std::string ucode() const   { return _ucode; }
+        std::string version() const { return _version; }
+        std::string exe() const     { return _exe; }
+        std::vector<std::string> graphs() const { return _graphs; }
+        // Bounds-checked: throws std::out_of_range for an index with no graph argument.
+        std::string graph(int i) const { return _graphs.at(i); }
+        std::string data() const    { return _data; }
+        std::string queries() const { return _queries; }
+
+        // Parsed state; public, and read directly by IO.
+        std::string              _ucode;
+        std::string              _version;
+        std::string              _exe;
+        std::vector<std::string> _graphs;
+        std::string              _data;
+        std::string              _queries;
+        OptionTable              _options;
+    };
+
+    // File reader for the inputs named by a Parser: graphs, database and queries.
+    class IO {
+    public:
+        IO() {}
+        IO(const Parser &p): _parser(p) {}
+
+        // Loads the i-th graph path via Graph500Data::FromASCIIFile and converts it to a Graph.
+        // Throws std::out_of_range if fewer than i+1 graph paths were parsed.
+        graph_tools::Graph graph(int i) {
+            const std::string &path = _parser._graphs.at(i);
+            std::cout << "Reading graph " << i << ": "
+                      << path << std::endl;
+
+            graph_tools::Graph500Data d = graph_tools::Graph500Data::FromASCIIFile(path);
+            return graph_tools::Graph::FromGraph500Data(d);
+        }
+
+        // Loads every graph named on the command line, in argument order.
+        std::vector<graph_tools::Graph> graphs() {
+            std::vector<graph_tools::Graph> graphs;
+            graphs.reserve(_parser._graphs.size());
+            for (int i = 0; i < static_cast<int>(_parser._graphs.size()); ++i)
+                graphs.push_back(graph(i));
+
+            return graphs;
+        }
+
+        // Reads binary file `fname` as an array of T; trailing bytes that do not
+        // form a whole T are ignored. Throws std::runtime_error on stat/open/read failure.
+        template <typename T>
+        std::vector<T> read(const std::string & fname) {
+            std::cerr << "Opening " << fname << std::endl;
+
+            struct stat st;
+            if (stat(fname.c_str(), &st) != 0) {
+                throw std::runtime_error(fname + ": " + std::string(strerror(errno)));
+            }
+            std::vector<T> v(st.st_size/sizeof(T));
+
+            FILE *f = fopen(fname.c_str(), "rb");
+            if (!f) {
+                throw std::runtime_error(fname + ": " + std::string(strerror(errno)));
+            }
+            // Read whole elements only: the previous fread of st.st_size bytes overran
+            // `v` whenever the file size was not a multiple of sizeof(T).
+            const size_t nread = fread(v.data(), sizeof(T), v.size(), f);
+            fclose(f);
+            if (nread != v.size()) {
+                throw std::runtime_error(fname + ": short read");
+            }
+            return v;
+        }
+
+        // Reads the database file as fixed-size arrays of N elements of type T.
+        template <typename T, int N>
+        std::vector<std::array<T, N>> database() { return read<std::array<T, N>>(_parser._data); }
+
+        // Reads the query file as fixed-size arrays of N elements of type T.
+        template <typename T, int N>
+        std::vector<std::array<T, N>> queries() { return read<std::array<T, N>>(_parser._queries); }
+
+        std::string ucode() const { return _parser._ucode; }
+        std::vector<int> do_queries() const  { return _parser.do_queries(); }
+
+        Parser _parser;
+    };
+
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp
new file mode 100644
index 000000000..55e410789
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp
@@ -0,0 +1,17 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWResultReader.hpp"
+namespace ipnsw {
+    // Abstract factory pairing a kernel runner with its result reader; subclasses supply both.
+    class IPNSWFactory {
+    public:
+        // Virtual: IPNSWRunner owns its factory through std::unique_ptr<IPNSWFactory>.
+        virtual ~IPNSWFactory() = default;
+        // Each accessor takes ownership of the raw pointer returned by the matching protected hook.
+        std::unique_ptr<IPNSWKernelRunner> KernelRunner() const { return std::unique_ptr<IPNSWKernelRunner>(_KernelRunner()); }
+        std::unique_ptr<IPNSWResultReader> ResultReader() const { return std::unique_ptr<IPNSWResultReader>(_ResultReader()); }
+    protected:
+        virtual IPNSWKernelRunner* _KernelRunner()const = 0;
+        virtual IPNSWResultReader* _ResultReader()const = 0;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp
new file mode 100644
index 000000000..4f942db01
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp
@@ -0,0 +1,69 @@
+#pragma once
+#include <HammerBlade.hpp>
+#include <Graph.hpp>
+#include <Graph500Data.hpp>
+#include <vector>
+
+namespace ipnsw {
+    // Pairs a host-side graph_tools::Graph with the device addresses of its offsets and neighbors arrays.
+    class Graph {
+    public:
+        Graph() : Graph(graph_tools::Graph()) {}
+        Graph(const graph_tools::Graph &g) : _graph(g) {}
+        // Cast back to an rvalue (what std::move does): a named rvalue reference is an lvalue, so `_graph(g)` copied.
+        Graph(graph_tools::Graph &&g) : _graph(static_cast<graph_tools::Graph &&>(g)) {}
+
+        // Allocates device buffers for the offsets and neighbors arrays and queues
+        // their contents with push_write(); no sync_write() is issued here.
+        void initialize_on_device() {
+            using hammerblade::host::HammerBlade;
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            auto & offsets = _graph.get_offsets();
+            auto & neighbors = _graph.get_neighbors();
+
+            _offsets  = hb->alloc(offsets.size() * sizeof(offsets[0]));
+            _neighbors = hb->alloc(neighbors.size() * sizeof(neighbors[0]));
+
+            hb->push_write(_offsets,   &offsets[0],   offsets.size() * sizeof(offsets[0]));
+            hb->push_write(_neighbors, &neighbors[0], neighbors.size() * sizeof(neighbors[0]));
+        }
+
+        graph_tools::Graph & graph() { return _graph; }
+        const graph_tools::Graph & graph() const { return _graph; }
+        hb_mc_eva_t offsets() const { return _offsets; }
+        hb_mc_eva_t neighbors() const  { return _neighbors; }
+
+        // Builds one {offsets, neighbors, V, E} record per graph in Gs, writes the
+        // array to device memory (push_write + sync_write) and returns its address.
+        static hb_mc_eva_t InitializeMetadataOnDevice(const std::vector<Graph> & Gs) {
+            using hammerblade::host::HammerBlade;
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            struct metadata {
+                hb_mc_eva_t offset;
+                hb_mc_eva_t neighbors;
+                int V;
+                int E;
+            };
+
+            std::vector<metadata> metad;
+            for (auto & g : Gs) {
+                std::cout << "Host: offset = " << std::hex << g.offsets() << " neighbors = " << g.neighbors() << std::endl;
+                std::cout << std::dec;
+                // Positional init: mixing designated and positional initializers is a compiler extension.
+                metadata m = { g.offsets(), g.neighbors(), g.graph().num_nodes(), g.graph().num_edges() };
+                metad.push_back(m);
+            }
+
+            hb_mc_eva_t buffer = hb->alloc(sizeof(metadata) * metad.size());
+            hb->push_write(buffer, metad.data(), sizeof(metadata) * metad.size());
+            hb->sync_write();
+
+            return buffer;
+        }
+
+    private:
+        graph_tools::Graph _graph;
+        hb_mc_eva_t _offsets{};   // value-initialized so offsets() is well-defined before initialize_on_device()
+        hb_mc_eva_t _neighbors{}; // value-initialized so neighbors() is well-defined before initialize_on_device()
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
new file mode 100644
index 000000000..e6042acaa
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include "HammerBlade.hpp"
+#include <memory>
+#include <string>
+namespace ipnsw {
+    class IPNSWRunner; // forward declaration
+
+    // Base class for launching one kernel: subclasses describe the job, runKernel() submits it.
+    class IPNSWKernelRunner {
+    public:
+        using HammerBlade = hammerblade::host::HammerBlade;
+        using Dim = hammerblade::host::Dim;
+        IPNSWKernelRunner() {}
+        // Virtual: IPNSWFactory hands runners out as std::unique_ptr<IPNSWKernelRunner>.
+        virtual ~IPNSWKernelRunner() = default;
+    protected:
+        virtual std::string kernelName(const IPNSWRunner & runner) const =0;
+        virtual std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const =0;
+        virtual Dim gd(const IPNSWRunner &runner) const = 0;  // first argument to push_jobv (presumably grid dimensions -- confirm)
+        virtual Dim tgd(const IPNSWRunner &runner) const = 0; // second argument to push_jobv (presumably tile-group dimensions -- confirm)
+
+    public:
+        // Queues the job described by the hooks on the instance from HammerBlade::Get(), then calls exec().
+        void runKernel(IPNSWRunner &runner) {
+            HammerBlade::Ptr hb = HammerBlade::Get();
+            hb->push_jobv(gd(runner), tgd(runner), kernelName(runner), argv(runner));
+            hb->exec();
+        }
+    };
+
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp
new file mode 100644
index 000000000..19eaff181
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp
@@ -0,0 +1,13 @@
+#pragma once
+#include "HammerBlade.hpp"
+namespace ipnsw {
+    class IPNSWRunner;
+
+    class IPNSWResultReader { // base class for result readers; the default readResults() is a no-op
+    protected:
+        using HammerBlade = hammerblade::host::HammerBlade;
+    public:
+        virtual ~IPNSWResultReader() = default; // virtual: IPNSWFactory hands readers out as std::unique_ptr<IPNSWResultReader>
+        virtual void readResults(const IPNSWRunner & runner) {}
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
new file mode 100644
index 000000000..3dbca5bec
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
@@ -0,0 +1,184 @@
+#pragma once
+#include "IO.hpp"
+#include "HammerBlade.hpp"
+#include "IPNSWGraph.hpp"
+#include "IPNSWFactory.hpp"
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWResultReader.hpp"
+#include "GreedyWalkResults.hpp"
+#include <memory>
+
+namespace ipnsw {
+
+    class IPNSWRunner {
+    public:
+        //static constexpr int QUERY = 276; // fewest dot products for greedy walk
+        //static constexpr int QUERY = 472; // fewest dot products for beam search
+        //static constexpr int QUERY = 427;
+        //static constexpr int QUERY = 355;
+        //static constexpr int QUERY = 2;
+        static constexpr int QUERY = 188;
+        //static constexpr int QUERY = 229;
+        //static constexpr int QUERY = 490;
+        //static constexpr int QUERY = 16;
+        //static constexpr int QUERY = 461;
+        //static constexpr int QUERY = 470;
+        // QUERY is the fallback query index, used when IO::do_queries() returns nothing.
+        using HammerBlade = hammerblade::host::HammerBlade;
+        using Dim = hammerblade::host::Dim;
+        // Moves the factory out of `fact`, then creates the IO (from p) and the runner/reader (from the factory).
+        IPNSWRunner(const Parser &p,
+                    std::unique_ptr<IPNSWFactory> & fact):
+            _factory(std::move(fact)) {
+            _io = std::unique_ptr<IO>(new IO(p));
+            _hb = HammerBlade::Get();
+            _kernel_runner = _factory->KernelRunner();
+            _result_reader = _factory->ResultReader();
+        }
+        // NOTE(review): deletes the handle obtained from HammerBlade::Get() - confirm this class owns it.
+        virtual ~IPNSWRunner() { delete _hb; }
+        // Reads the four graph levels (stored in reverse of IO order), the database and the queries.
+        void readInput() {
+            auto graphs   = _io->graphs();
+            _graphs = {
+                Graph(std::move(graphs[3])),
+                Graph(std::move(graphs[2])),
+                Graph(std::move(graphs[1])),
+                Graph(std::move(graphs[0]))
+            };
+
+            _db       = _io->database<float,100>();
+            _queries  = _io->queries<float,100>();
+        }
+        // Loads the program found at ucodePath().
+        void loadProgram() {
+            _hb->load_application(ucodePath());
+        }
+        // Allocates device space for the database and pushes a write of its contents.
+        void initializeDeviceMemoryDB() {
+            std::cout << "Initializing database " << std::endl;
+            _db_dev = _hb->alloc(_db.size() * sizeof(_db[0]));
+            _hb->push_write(_db_dev, _db.data(), _db.size() * sizeof(_db[0])); // data() is valid even when _db is empty
+        }
+        // Uploads a single query vector: the first entry of do_queries() if any, otherwise QUERY.
+        void initializeDeviceMemoryQuery() {
+            std::cout << "Initializing query "  << std::endl;
+            int query = QUERY;
+
+            auto do_queries = _io->do_queries();
+            if (!do_queries.empty())
+                query = do_queries[0];
+            auto &q = _queries.at(query); // throws std::out_of_range instead of reading past the vector
+            _query_dev = _hb->alloc(sizeof(q));
+            _hb->push_write(_query_dev, &q, sizeof(q));
+        }
+        // Allocates one int per database entry for the seen set; nothing is written to it here.
+        void initializeDeviceMemorySeen() {
+            std::cout << "Initializing seen set " << std::endl;
+            _seen_dev = _hb->alloc(_db.size() * sizeof(int));
+        }
+        // Sets up each graph level on the device, then the metadata block covering all of them.
+        void initializeDeviceMemoryGraphs() {
+            for (auto & graph : _graphs)
+                graph.initialize_on_device();
+
+            _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs);
+        }
+        // Device storage for one int (v_curr) and one float (d_curr).
+        void initializeDeviceVCurr() {
+            _v_curr_dev = _hb->alloc(sizeof(int));
+        }
+        void initializeDeviceDCurr() {
+            _d_curr_dev = _hb->alloc(sizeof(float));
+        }
+        // NOTE(review): 513 and 129 look like heap capacities 512 and 128 plus one slot - confirm against the kernels.
+        void initializeDeviceCandidateDev() {
+            _candidates_dev = _hb->alloc(sizeof(GreedyWalkResult)*513);
+        }
+
+        void initializeDeviceResultsDev() {
+            _results_dev = _hb->alloc(sizeof(GreedyWalkResult) * 129);
+        }
+
+        void initializeDeviceNResultsDev() {
+            _n_results_dev = _hb->alloc(sizeof(int));
+        }
+        // Runs every initializer above, then calls sync_rw().
+        void initializeDeviceMemory() {
+            initializeDeviceMemoryDB();
+            initializeDeviceMemoryQuery();
+            initializeDeviceMemorySeen();
+            initializeDeviceMemoryGraphs();
+            initializeDeviceVCurr();
+            initializeDeviceDCurr();
+            initializeDeviceCandidateDev();
+            initializeDeviceResultsDev();
+            initializeDeviceNResultsDev();
+            // sync
+            std::cout << "Starting DMA" << std::endl;
+            _hb->sync_rw();
+        }
+
+        void runKernel() {
+            std::cout << "Launching kernel" << std::endl;
+            _kernel_runner->runKernel(*this);
+        }
+
+        void readResults() {
+            _result_reader->readResults(*this);
+
+        }
+        // Whole flow: read input, load program, set up device memory, launch, read results.
+        void run() {
+            readInput();
+            loadProgram();
+            initializeDeviceMemory();
+            runKernel();
+            readResults();
+        }
+
+        /////////////
+        // Getters //
+        /////////////
+        std::string ucodePath() const {
+            return _io->ucode();
+        }
+
+        hb_mc_eva_t db_dev() const { return _db_dev; }
+        hb_mc_eva_t query_dev() const { return _query_dev; }
+        hb_mc_eva_t seen_dev() const { return _seen_dev; }
+        hb_mc_eva_t v_curr_dev() const { return _v_curr_dev; }
+        hb_mc_eva_t d_curr_dev() const { return _d_curr_dev; }
+        hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; }
+        hb_mc_eva_t candidates_dev() const { return _candidates_dev; }
+        hb_mc_eva_t results_dev() const { return _results_dev; }
+        hb_mc_eva_t n_results_dev() const { return _n_results_dev; }
+
+        /////////////
+        // Setters //
+        /////////////
+
+    private:
+        std::unique_ptr<IO>                  _io;
+        std::vector<Graph>                   _graphs;
+        std::vector<std::array<float, 100>>  _db;
+        std::vector<std::array<float, 100>>  _queries;
+        HammerBlade::Ptr                     _hb;
+
+        // device pointers (zero until the matching initializeDevice* method assigns them)
+        hb_mc_eva_t _db_dev = 0;
+        hb_mc_eva_t _query_dev = 0;
+        hb_mc_eva_t _seen_dev = 0;
+        hb_mc_eva_t _v_curr_dev = 0;
+        hb_mc_eva_t _d_curr_dev = 0;
+        hb_mc_eva_t _graph_metadata_dev = 0;
+        hb_mc_eva_t _candidates_dev = 0;
+        hb_mc_eva_t _results_dev = 0;
+        hb_mc_eva_t _n_results_dev = 0;
+
+        // composites
+        std::unique_ptr<IPNSWKernelRunner> _kernel_runner;
+        std::unique_ptr<IPNSWResultReader> _result_reader;
+        std::unique_ptr<IPNSWFactory>      _factory;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
new file mode 100644
index 000000000..9a4861844
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
@@ -0,0 +1,19 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+namespace ipnsw {
+    // Factory producing the inner-product micro-benchmark kernel runner and result reader.
+    class IProductUBmkFactory : public IPNSWFactory {
+    public:
+        // `iterations` is handed unchanged to every kernel runner this factory creates.
+        IProductUBmkFactory(int iterations = 10): _iterations(iterations) {}
+    private:
+        IPNSWKernelRunner *_KernelRunner() const {
+            return new IProductUBmkKernelRunner(_iterations);
+        }
+        IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; }
+        int _iterations;
+    };
+}
+
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
new file mode 100644
index 000000000..e9d3010bc
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+
+namespace ipnsw {
+    // Kernel runner for the inner-product micro-benchmark kernel "inner_product_ubmk".
+    class IProductUBmkKernelRunner : public IPNSWKernelRunner {
+    public:
+        IProductUBmkKernelRunner(int iterations = 10) :
+            IPNSWKernelRunner(),
+            _iterations(iterations) {
+        }
+
+    private:
+        std::string kernelName(const IPNSWRunner & runner) const override {
+            return "inner_product_ubmk";
+        }
+        // Kernel arguments, in order: database address, query address, number of inner products.
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const override {
+            return {
+                runner.db_dev(), // database
+                runner.query_dev(), // query
+                static_cast<hb_mc_eva_t>(_iterations), // number of inner products
+            };
+        }
+        Dim gd(const IPNSWRunner &runner) const override {return Dim(1,1);}
+        Dim tgd(const IPNSWRunner &runner) const override {return Dim(1,1);}
+
+        int _iterations;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp
new file mode 100644
index 000000000..300990b18
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp
@@ -0,0 +1,12 @@
+#pragma once
+#include "IPNSWRunner.hpp"
+#include "IPNSWResultReader.hpp"
+
+namespace ipnsw {
+    class IProductUBmkResultReader : public IPNSWResultReader {
+    public:
+        void readResults(const IPNSWRunner & runner) override { // reads nothing back; only reports completion
+            std::cout << "Done" << std::endl;
+        }
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
new file mode 100644
index 000000000..6e814f018
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -0,0 +1,351 @@
+# Copyright (c) 2019, University of Washington All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+# 
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+# 
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+################################################################################
+# Paths / Environment Configuration
+################################################################################
+_REPO_ROOT ?= $(shell git rev-parse --show-toplevel)
+CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+# CURRENT_PATH is this Makefile's directory; the "-" prefix makes the include below optional.
+-include $(_REPO_ROOT)/environment.mk
+
+################################################################################
+# Define BSG_MACHINE_PATH, the location of the Makefile.machine.include file
+# that defines the machine to compile and simulate on. Using BSG_F1_DIR (which
+# is set in environment.mk) uses the same machine as in bsg_replicant.
+################################################################################
+# This example pins the pod_X1Y1_ruche_X16Y8_hbm machine under $(BSG_F1_DIR)/machines.
+BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm
+
+################################################################################
+# Define the range of versions
+################################################################################
+# Kernel versions. See kernel/README.md for more information.  Version names do
+# not need to use v* and can be any string
+VERSIONS := greedy_walk    #  inner product with ipc=0.3 (8x4)
+VERSIONS += greedy_walk_v1 #  inner product with ipc=0.43 (8x4)
+VERSIONS += greedy_walk_v2 #  inner product with FLOPS/cycle=0.2  (8x4)
+VERSIONS += greedy_walk_v3 #  inner product with FLOPS/cycle=0.26 (8x4)
+VERSIONS += beam_search    #  very slow - uses a very dumb sparse set
+VERSIONS += beam_search_v1 #  dense set - inner product with ipc=0.3  (8x4)
+VERSIONS += beam_search_v2 #  dense set - inner product with ipc=0.43 (8x4)
+VERSIONS += beam_search_v3 #  + inner_product_v2 (flops/cycle=0.2039) (8x4)
+VERSIONS += beam_search_v4 #  + inner_product_v3 (flops/cycle=0.2663) (8x4)
+VERSIONS += beam_search_v5 #  + Bit vector for dense set
+VERSIONS += debug
+VERSIONS += iproduct_ubmk # baseline - ipc = 0.3
+VERSIONS += iproduct_ubmk_v1 # using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867
+VERSIONS += iproduct_ubmk_v2 # + FMA, ipc = 0.386, flops/cycle = 0.2039
+VERSIONS += iproduct_ubmk_v3 # + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4)
+VERSIONS += iproduct_ubmk_v4 # Slightly cleaner code than v3 - similar performance
+# NOTE(review): _KERNEL_COMPILER is presumably read by the kernel build rules included later - confirm.
+_KERNEL_COMPILER = CLANG
+################################################################################
+# Define any sources that should be used compiled during kernel compilation,
+# including the source file with the kernel itself. kernel.riscv will
+# be the name of the compiled RISC-V Binary for the Manycore
+#
+# Use KERNEL_*LIBRARIES list sources that should be compiled and linked with all
+# kernel.cpp versions. However, if you have version-specific sources you must
+# come up with your own solution.
+# 
+# Use KERNEL_INCLUDES to specify the path to directories that contain headers.
+################################################################################
+
+# C Libraries
+KERNEL_CLIBRARIES   +=
+# C++ Libraries
+KERNEL_CXXLIBRARIES +=
+# Extra header search path for kernel builds: kernel/include next to this Makefile.
+KERNEL_INCLUDES     += -I$(CURRENT_PATH)/kernel/include
+# DEFAULT_VERSION names the kernel/<version>/ directory whose kernel.cpp is the default.
+# Define the default kernel.cpp file. If KERNEL_DEFAULT is not defined it will
+# be set to kernel.cpp in the same directory as this Makefile.
+DEFAULT_VERSION     := greedy_walk_v3
+KERNEL_DEFAULT      := kernel/$(DEFAULT_VERSION)/kernel.cpp
+#KERNEL_DEFAULT      := kernel/$(DEFAULT_VERSION)/kernel.c
+
+################################################################################
+# Include the kernel build rules (This must be included after KERNEL_*LIBRARIES,
+# KERNEL_DEFAULT, KERNEL_INCLUDES, etc)
+################################################################################
+
+-include $(EXAMPLES_PATH)/examples/cuda/riscv.mk
+
+################################################################################
+# END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES
+################################################################################
+
+
+################################################################################
+# Define the $(HOST_TARGET), the name of the host executable to generate. The
+# cosimulation host executable will be called
+# $(HOST_TARGET).cosim. HOST_*SOURCES list the host files that should be
+# compiled and linked into the executable.
+################################################################################
+
+HOST_TARGET         := ipnsw
+HOST_CSOURCES       := 
+HOST_CXXSOURCES     += GreedyWalkResults.cpp
+HOST_INCLUDES       := -I$(CURRENT_PATH)
+
+################################################################################
+# Default program arguments: the music100 database and query files, then the
+# level 0-3 edge files. (The cosim build rules themselves are included further below.)
+################################################################################
+
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2
+ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3
+
+
+################################
+# Inner Product U-Benchmarking #
+################################
+# number iproducts
+N-IPRODUCTS := 150 500 1000 1500 2000 3000
+IPRODUCT-BASENAME := iproduct_ubmk_v4
+# For each count n above, the rule below creates version iproduct_ubmk-n from the base kernel.
+define IPRODUCT-UBMK-RULE
+# creates run directory from template
+kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp
+	mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+# adds arguments
+kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: ARGS += --num-iproducts $(1)
+
+# adds to list of iproduct u-bmk
+IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1)
+endef
+
+# Expand rule for each inner product input
+$(foreach nip,$(N-IPRODUCTS),$(eval $(call IPRODUCT-UBMK-RULE,$(nip))))
+# Helper goals acting on all generated iproduct_ubmk-<n> directories.
+.PHONY: create-iproduct-ubmk
+.PHONY: purge-iproduct-ubmk
+.PHONY: iproduct-ubmk-stats
+
+# create rule
+create-iproduct-ubmk: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/kernel.cpp)
+
+# purge rule
+purge-iproduct-ubmk:
+	rm -rf $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v)
+
+# collect stats for all
+iproduct-ubmk-stats: create-iproduct-ubmk
+iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/stats)
+
+# Add to versions
+VERSIONS += $(IPRODUCT-UBMK-VERSIONS)
+
+####################
+# Greedy Walk Runs #
+####################
+GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490
+GREEDY-WALK-BASENAME := greedy_walk_v3
+define GREEDY-WALK-RULE
+# creates run directory from template
+kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp
+	mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+# adds arguments
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1)
+
+# adds to list of greedy walk versions
+GREEDY-WALK-VERSIONS += greedy_walk-query$(1)
+endef
+# Each query index q gets a kernel/greedy_walk-query<q> copy of the base kernel, run with --queries q.
+# Expand rule for each query
+$(foreach q,$(GREEDY-WALK-QUERIES),$(eval $(call GREEDY-WALK-RULE,$(q))))
+# Helper goals acting on all generated greedy_walk-query<q> directories.
+.PHONY: create-greedy-walk
+.PHONY: purge-greedy-walk
+.PHONY: greedy-walk-stats
+
+# create rule
+create-greedy-walk: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/kernel.cpp)
+
+# purge rule
+purge-greedy-walk:
+	rm -rf $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v)
+
+# collect stats for all
+greedy-walk-stats: create-greedy-walk
+greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/stats)
+
+# Add to versions
+VERSIONS += $(GREEDY-WALK-VERSIONS)
+
+####################
+# Beam Search Runs #
+####################
+BEAM-SEARCH-QUERIES := 2 188 229 355 427 472
+BEAM-SEARCH-BASENAME := beam_search_v5
+# Each query index q gets a kernel/beam_search-query<q> copy of the base kernel, run with --queries q.
+define BEAM-SEARCH-RULE
+# creates run directory from template
+kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.cpp
+	mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+# adds arguments
+kernel/beam_search-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1)
+
+# adds to list of beam search versions
+BEAM-SEARCH-VERSIONS += beam_search-query$(1)
+endef
+
+
+# Expand rule for each query
+$(foreach q,$(BEAM-SEARCH-QUERIES),$(eval $(call BEAM-SEARCH-RULE,$(q))))
+# Helper goals acting on all generated beam_search-query<q> directories.
+.PHONY: create-beam-search
+.PHONY: purge-beam-search
+.PHONY: beam-search-stats
+
+# create rule
+create-beam-search: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/kernel.cpp)
+
+# purge rule
+purge-beam-search:
+	rm -rf $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v)
+
+# collect stats for all
+beam-search-stats: create-beam-search
+beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/stats)
+
+# Add to versions
+VERSIONS += $(BEAM-SEARCH-VERSIONS)
+
+########################################
+# Continue including cosim build rules #
+########################################
+
+-include $(FRAGMENTS_PATH)/host/cosim.mk
+# graph-tools and hammerblade-helpers are expected as subdirectories next to this Makefile.
+GRAPH-TOOLS := $(CURRENT_PATH)/graph-tools
+graphtools-dir := $(GRAPH-TOOLS)
+
+include $(GRAPH-TOOLS)/libgraphtools.mk
+
+HB-HELPERS := $(CURRENT_PATH)/hammerblade-helpers
+include $(HB-HELPERS)/libhammerblade-helpers-host.mk
+# Compile and link flags exported by the two helper libraries included above.
+CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags)
+CXXFLAGS += $(libgraphtools-interface-cxxflags)
+
+LDFLAGS  += $(libhammerblade-helpers-host-interface-ldflags)
+LDFLAGS  += $(libgraphtools-interface-ldflags)
+VSOURCES += GreedyWalkResults.cpp
+# NOTE(review): the VSOURCES line above receives a C++ file - confirm this is intended.
+$(HOST_TARGET): $(libhammerblade-helpers-host-interface-headers)
+$(HOST_TARGET): $(libgraphtools-interface-headers)
+$(HOST_TARGET): $(libgraphtools-interface-libraries)
+$(HOST_TARGET): GreedyWalkResults.o
+
+GreedyWalkResults.o: GreedyWalkResults.cpp
+GreedyWalkResults.o: GreedyWalkResults.hpp
+# Header prerequisites of ipnsw.o, listed by hand.
+ipnsw.o: IO.hpp
+ipnsw.o: IPNSWGraph.hpp
+ipnsw.o: IPNSWRunner.hpp
+ipnsw.o: IPNSWKernelRunner.hpp
+ipnsw.o: GreedyWalkKernelRunner.hpp
+ipnsw.o: BeamSearchKernelRunner.hpp
+ipnsw.o: IProductUBmkKernelRunner.hpp
+ipnsw.o: IPNSWResultReader.hpp
+ipnsw.o: GreedyWalkResultReader.hpp
+ipnsw.o: BeamSearchResultReader.hpp
+ipnsw.o: GreedyWalkResults.hpp
+ipnsw.o: IPNSWFactory.hpp
+ipnsw.o: GreedyWalkFactory.hpp
+ipnsw.o: BeamSearchFactory.hpp
+ipnsw.o: IProductUBmkFactory.hpp
+ipnsw.o: StringHelpers.hpp
+################################################################################
+# Define the clean rules. clean calls the makefile-specific cleans, whereas
+# users can add commands and dependencies to custom.clean.
+################################################################################
+version.clean:
+	rm -rf kernel/*/*{.csv,.log,.rvo,.riscv,.vpd,.key,.png,.dis}
+	rm -rf kernel/*/{stats,pc_stats}
+# NOTE(review): the {a,b} patterns above need a shell with brace expansion (e.g. bash) - confirm SHELL.
+custom.clean: version.clean
+
+clean: cosim.clean analysis.clean cudalite.clean custom.clean
+
+################################################################################
+# Define overall-goals. The all rule runs all kernel versions, and the default
+# kernel.
+################################################################################
+
+_HELP_STRING := "Makefile Rules\n"
+# Each goal below appends its own description to _HELP_STRING, which the help goal prints.
+_HELP_STRING += "    default: \n"
+_HELP_STRING += "        - Run the default kernel ($(KERNEL_DEFAULT)) and generate all of the\n"
+_HELP_STRING += "          analysis products\n"
+default: pc_stats graphs stats
+
+_HELP_STRING += "    analysis: \n"
+_HELP_STRING += "        - Launch independent cosimulation executions of each kernel version.\n"
+_HELP_STRING += "          When execution finishes, it generates all the analysis products \n"
+_HELP_STRING += "          for each kernel in each respective kernel/<version_name>/ \n"
+_HELP_STRING += "          directory\n"
+analysis: $(foreach v,$(VERSIONS),kernel/$v/pc_stats kernel/$v/graphs kernel/$v/stats)
+
+_HELP_STRING += "    statistics: \n"
+_HELP_STRING += "        - Launch independent cosimulation executions of each kernel version.\n"
+_HELP_STRING += "          When execution finishes, it generates ONLY the parsed operation \n"
+_HELP_STRING += "          stats for each kernel in each respective kernel/<version_name>/ \n"
+_HELP_STRING += "          directory\n"
+statistics: $(foreach v,$(VERSIONS),kernel/$v/stats)
+
+_HELP_STRING += "    all: \n"
+_HELP_STRING += "        - Launch both the default and analysis target\n"
+all: analysis default
+
+.DEFAULT_GOAL = help
+_HELP_STRING += "    help: \n"
+_HELP_STRING += "        - Output a friendly help message.\n"
+help:
+	@echo -e $(HELP_STRING)
+
+# Always re-run, if asked. None of these goals names a file this Makefile produces.
+.PHONY: default analysis statistics all help
+
+# These last three lines ensure that _HELP_STRING is appended to the top of
+# whatever else comes before it.
+_HELP_STRING += "\n"
+_HELP_STRING += $(HELP_STRING)
+HELP_STRING := $(_HELP_STRING)
+
diff --git a/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
new file mode 100644
index 000000000..39e09b1a9
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp
@@ -0,0 +1,17 @@
+#pragma once
+#include <string>
+#include <sstream>
+
+namespace ipnsw {
+    static bool startswith(const std::string &st, const std::string &prefix) {
+        return st.compare(0, prefix.size(), prefix) == 0; // true exactly when st begins with prefix
+    }
+
+    template <typename T> // extracts a T from str with operator>>
+    T from_string(const std::string &str) {
+        std::stringstream ss(str);
+        T v{}; // value-initialized so an extraction that never starts (e.g. empty str) returns T{}
+        ss >> v;
+        return v;
+    }
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
new file mode 160000
index 000000000..5915cc2c4
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
@@ -0,0 +1 @@
+Subproject commit 5915cc2c4bc6336102c452a4e7d0a7b06ccf9222
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
new file mode 100644
index 000000000..ea920b295
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
@@ -0,0 +1,86 @@
+#include "ipnsw.hpp"
+#include "HammerBlade.hpp"
+#include "Graph500Data.hpp"
+#include "Graph.hpp"
+#include "IO.hpp"
+#include "IPNSWGraph.hpp"
+#include "IPNSWRunner.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+#include "IProductUBmkFactory.hpp"
+#include "BeamSearchKernelRunner.hpp"
+#include "BeamSearchResultReader.hpp"
+#include "BeamSearchFactory.hpp"
+#include "GreedyWalkKernelRunner.hpp"
+#include "GreedyWalkResultReader.hpp"
+#include "GreedyWalkFactory.hpp"
+#include "GreedyWalkResults.hpp"
+#include "StringHelpers.hpp"
+#include <iostream>
+#include <memory>
+
+#include "GreedyWalkResults.cpp"
+
+using namespace ipnsw;
+
+// Picks the factory matching args.version() and runs the whole IPNSW flow; always returns 0.
+int Main(int argc, char *argv[])
+{
+    Parser args;
+    args.parse(argc, argv);
+
+    std::unique_ptr<IPNSWFactory> factory;
+
+    if (ipnsw::startswith(args.version(), "greedy_walk")) {
+        factory = std::unique_ptr<IPNSWFactory>(new GreedyWalkFactory);
+    } else if (ipnsw::startswith(args.version(), "beam_search")) {
+        factory = std::unique_ptr<IPNSWFactory>(new BeamSearchFactory);
+    } else if (ipnsw::startswith(args.version(), "iproduct_ubmk")) {
+        /* parse the number of inner products */
+        int n_iproducts = args.num_iproducts();
+        std::cout << "num inner products " << n_iproducts << std::endl;
+        factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkFactory(n_iproducts));
+    } else if (args.version() == "debug") {
+        /* just for debugging */
+        std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl;
+        std::cout << "--queries=";
+        auto do_queries = args.do_queries();
+        for (auto q : do_queries) {
+            std::cout << q << " ";
+        }
+        std::cout << std::endl;
+        return 0;
+    } else { // unknown version: report it instead of exiting silently (exit code stays 0)
+        std::cerr << "unrecognized version '" << args.version() << "'" << std::endl;
+        return 0;
+    }
+
+    IPNSWRunner runner(args, factory); // the runner takes over the factory
+    runner.run();
+    return 0;
+}
+
+#ifdef COSIM
+// Cosimulation entry point; the value returned by Main is stored in *exit_code.
+void cosim_main(uint32_t *exit_code, char * args) {
+    // Command line arguments are not passed directly, so they are parsed
+    // out of args. args is a string from VCS - to fill it, pass c_args to
+    // VCS as follows: +c_args="<space separated list of args>"
+    int argc = get_argc(args);
+    char *argv[argc];
+    get_argv(args, argc, argv);
+
+#ifdef VCS
+    // Make "tb" the active scope before running the program.
+    svScope scope = svGetScopeFromName("tb");
+    svSetScope(scope);
+#endif
+
+    *exit_code = Main(argc, argv);
+}
+#else
+// Native entry point: hands the process arguments straight to Main.
+int main(int argc, char ** argv) {
+    return Main(argc, argv);
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
new file mode 100644
index 000000000..385873c50
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
@@ -0,0 +1,37 @@
+// Copyright (c) 2019, University of Washington All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// 
+// Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the following disclaimer.
+// 
+// Redistributions in binary form must reproduce the above copyright notice, this
+// list of conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.
+// 
+// Neither the name of the copyright holder nor the names of its contributors may
+// be used to endorse or promote products derived from this software without
+// specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma  once
+#include <cstring>
+#include <cstdlib>
+#include <random>
+#include <limits>
+#include <iostream>
+#include <typeinfo>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_cuda.h>
+#include "../common.h"
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp
new file mode 100644
index 000000000..a75d5b1bf
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * This kernel implements the IPNSW beam search (ipnsw_beam_search)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // size given to the seen set
+#define VSIZE 100 // floats per vector: query buffer length and database stride
+#define NG 4
+#define V_ENTRY 82026
+// EF bounds the size of the results heap; N_RESULTS caps how many results are reported.
+#define EF        128
+#define N_RESULTS 10
+// Indices into the Gs array passed to the kernels.
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // one graph level, as found in the Gs array passed to the kernels
+    const int *offsets; // offsets[v] is the index of v's first entry in neighbors
+    const int *neighbors; // neighbor ids of all vertices, one list after another
+    int V; // the kernel treats V-1 as the last vertex (presumably the vertex count)
+    int E; // end index of the last vertex's list (presumably the edge count)
+};
+
+class LT { // true when lhs has the smaller distance (pair.first); the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so const comparator objects can be called
+    }
+};
+
+class GT { // true when lhs has the larger distance (pair.first); the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so const comparator objects can be called
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Debug helper: copies each graph descriptor out of Gs (levels G_0..G_3) and, when
+    // DEBUG_INPUT_TEST is defined, prints the argument addresses and descriptor fields.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        const int levels [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            struct graph G;
+            memcpy(&G, &Gs[levels[j]], sizeof(G));
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+// distance(v0, v1) is the negated inner product, so a larger inner product means a smaller distance.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    // Beam search on graph level Gs[G_0], starting from the vertex/distance read from v_curr_o/d_curr_o.
+    // Stores min(results.size(), N_RESULTS) in *n_results and returns 0.
+    int ipnsw_beam_search(const graph *Gs, const float *database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o, std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem, int *n_results)
+    {
+        // keep track of vertices seen
+        DynSet<int, std::less<int>> seen(seen_mem, N_V);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+        //bsg_print_int(v_curr);
+        //bsg_print_float(d_curr);
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+            //v_worst = std::get<1>(results.top());
+            bsg_print_int(-v_best);
+            // stop once the best remaining candidate is worse than the worst kept result
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // traverse neighbors of v_best
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0; // last vertex: list runs to G.E
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+                bsg_print_int(dst);
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+        // NOTE(review): only the first n_res slots of results_mem get sorted; whether they hold the best entries depends on DynHeap's layout - confirm.
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+n_res, LT());
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
new file mode 100644
index 000000000..a69965073
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * IPNSW beam search kernel (v1): beam search over graph Gs[G_0] on a 1x1 tile group, scored with inner_product.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // not referenced in this file -- presumably the database vertex count; TODO confirm
+#define VSIZE 100 // floats per vector: size of the local query copy and stride into database
+#define NG 4 // not referenced in this file -- presumably the number of graphs in Gs; TODO confirm
+#define V_ENTRY 82026 // not referenced in this file -- presumably the search entry vertex; TODO confirm
+
+#define EF        128 // cap on the number of entries kept in the beam-search results heap
+#define N_RESULTS 10 // upper bound on the count written to *n_results
+
+#define G_0 3 // G_x: index into the Gs array; beam search runs on Gs[G_0],
+#define G_1 2 // and input_test visits the graphs in G_0, G_1, G_2, G_3 order
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index into neighbors where v's adjacency list starts
+    const int *neighbors; // concatenated adjacency lists; entries are vertex ids
+    int V;                // vertex count; V-1 is treated as the last vertex
+    int E;                // length of neighbors; used as the end of the last vertex's list
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Argument smoke test: copies each graph descriptor out of Gs (in G_0..G_3 order)
+    // and, when DEBUG_INPUT_TEST is defined, prints every input. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph g;
+        for (int j = 0; j < 4; ++j) {
+            memcpy(&g, &Gs[order[j]], sizeof(g));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", j, g.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, g.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, g.V);
+            bsg_printf("G[%d].E = %d\n", j, g.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Search "distance" is the negated inner product, so a smaller value means a better match.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over graph Gs[G_0], seeded with the greedy-walk result; always returns 0.
+    //   Gs             - graph descriptors; only Gs[G_0] is read
+    //   database       - vectors of VSIZE floats each, indexed by vertex id
+    //   query          - query vector of VSIZE floats
+    //   seen_mem       - backing store for the visited-vertex set
+    //   v_curr_o/d_curr_o - in: start vertex and its distance to the query
+    //   candidates_mem - backing store for the frontier heap
+    //   results_mem    - backing store for the result heap; on return its first
+    //                    *n_results entries are the best matches, nearest first
+    //   n_results      - out: number of results, at most N_RESULTS
+    int ipnsw_beam_search(const graph *Gs, const float *database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops the nearest vertex first (GT); results keeps the
+        // farthest kept result on top (LT) so that it is the one evicted
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, EF);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            auto best = candidates.pop();
+            int   v_best = std::get<1>(best);
+            float d_best = std::get<0>(best);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the nearest open candidate is worse than every kept result
+            if (d_best > std::get<0>(results.top()))
+                break;
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so
+            // its list ends at E (the test must use v_best, not the start vertex)
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (seen.in(dst))
+                    continue;
+                // mark as seen
+                seen.insert(dst);
+                float d_neib = distance(q, &database[dst*VSIZE]);
+                float d_worst = std::get<0>(results.top());
+                // if there's room for new result or this distance is promising
+                if ((results.size() < EF) || (d_neib < d_worst)) {
+                    candidates.push({d_neib, dst});
+                    // evict the worst result before inserting so that results
+                    // never holds more than EF entries (same final contents)
+                    if (results.size() >= EF)
+                        results.pop();
+                    results.push({d_neib, dst});
+                }
+            }
+        }
+
+        // a heap's backing array is not sorted, so pick the n_res nearest of all held
+        // entries (NOTE(review): assumes DynHeap stores them at results_mem[0..size))
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp
new file mode 100644
index 000000000..b0f374a4c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam search kernel (v2): beam search over graph Gs[G_0] on a 1x1 tile group, scored with inner_product_v1.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // not referenced in this file -- presumably the database vertex count; TODO confirm
+#define VSIZE 100 // floats per vector: size of the local query copy and stride into database
+#define NG 4 // not referenced in this file -- presumably the number of graphs in Gs; TODO confirm
+#define V_ENTRY 82026 // not referenced in this file -- presumably the search entry vertex; TODO confirm
+
+#define EF        128 // cap on the number of entries kept in the beam-search results heap
+#define N_RESULTS 10 // upper bound on the count written to *n_results
+
+#define G_0 3 // G_x: index into the Gs array; beam search runs on Gs[G_0],
+#define G_1 2 // and input_test visits the graphs in G_0, G_1, G_2, G_3 order
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index into neighbors where v's adjacency list starts
+    const int *neighbors; // concatenated adjacency lists; entries are vertex ids
+    int V;                // vertex count; V-1 is treated as the last vertex
+    int E;                // length of neighbors; used as the end of the last vertex's list
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Argument smoke test: copies each graph descriptor out of Gs (in G_0..G_3 order)
+    // and, when DEBUG_INPUT_TEST is defined, prints every input. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph g;
+        for (int j = 0; j < 4; ++j) {
+            memcpy(&g, &Gs[order[j]], sizeof(g));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", j, g.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, g.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, g.V);
+            bsg_printf("G[%d].E = %d\n", j, g.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Search "distance" is the negated inner product, so a smaller value means a better match.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over graph Gs[G_0], seeded with the greedy-walk result; always returns 0.
+    //   Gs             - graph descriptors; only Gs[G_0] is read
+    //   database       - vectors of VSIZE floats each, indexed by vertex id
+    //   query          - query vector of VSIZE floats
+    //   seen_mem       - backing store for the visited-vertex set
+    //   v_curr_o/d_curr_o - in: start vertex and its distance to the query
+    //   candidates_mem - backing store for the frontier heap
+    //   results_mem    - backing store for the result heap; on return its first
+    //                    *n_results entries are the best matches, nearest first
+    //   n_results      - out: number of results, at most N_RESULTS
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops the nearest vertex first (GT); results keeps the
+        // farthest kept result on top (LT) so that it is the one evicted
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, EF);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            auto best = candidates.pop();
+            int   v_best = std::get<1>(best);
+            float d_best = std::get<0>(best);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the nearest open candidate is worse than every kept result
+            if (d_best > std::get<0>(results.top()))
+                break;
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so
+            // its list ends at E (the test must use v_best, not the start vertex)
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (seen.in(dst))
+                    continue;
+                // mark as seen
+                seen.insert(dst);
+                float d_neib = distance(q, &database[dst*VSIZE]);
+                float d_worst = std::get<0>(results.top());
+                // if there's room for new result or this distance is promising
+                if ((results.size() < EF) || (d_neib < d_worst)) {
+                    candidates.push({d_neib, dst});
+                    // evict the worst result before inserting so that results
+                    // never holds more than EF entries (same final contents)
+                    if (results.size() >= EF)
+                        results.pop();
+                    results.push({d_neib, dst});
+                }
+            }
+        }
+
+        // a heap's backing array is not sorted, so pick the n_res nearest of all held
+        // entries (NOTE(review): assumes DynHeap stores them at results_mem[0..size))
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp
new file mode 100644
index 000000000..f98216636
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam search kernel (v3): beam search over graph Gs[G_0] on a 1x1 tile group, scored with inner_product_v2.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // not referenced in this file -- presumably the database vertex count; TODO confirm
+#define VSIZE 100 // floats per vector: size of the local query copy and stride into database
+#define NG 4 // not referenced in this file -- presumably the number of graphs in Gs; TODO confirm
+#define V_ENTRY 82026 // not referenced in this file -- presumably the search entry vertex; TODO confirm
+
+#define EF        128 // cap on the number of entries kept in the beam-search results heap
+#define N_RESULTS 10 // upper bound on the count written to *n_results
+
+#define G_0 3 // G_x: index into the Gs array; beam search runs on Gs[G_0],
+#define G_1 2 // and input_test visits the graphs in G_0, G_1, G_2, G_3 order
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index into neighbors where v's adjacency list starts
+    const int *neighbors; // concatenated adjacency lists; entries are vertex ids
+    int V;                // vertex count; V-1 is treated as the last vertex
+    int E;                // length of neighbors; used as the end of the last vertex's list
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Argument smoke test: copies each graph descriptor out of Gs (in G_0..G_3 order)
+    // and, when DEBUG_INPUT_TEST is defined, prints every input. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph g;
+        for (int j = 0; j < 4; ++j) {
+            memcpy(&g, &Gs[order[j]], sizeof(g));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", j, g.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, g.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, g.V);
+            bsg_printf("G[%d].E = %d\n", j, g.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Search "distance" is the negated inner product, so a smaller value means a better match.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over graph Gs[G_0], seeded with the greedy-walk result; always returns 0.
+    //   Gs             - graph descriptors; only Gs[G_0] is read
+    //   database       - vectors of VSIZE floats each, indexed by vertex id
+    //   query          - query vector of VSIZE floats
+    //   seen_mem       - backing store for the visited-vertex set
+    //   v_curr_o/d_curr_o - in: start vertex and its distance to the query
+    //   candidates_mem - backing store for the frontier heap
+    //   results_mem    - backing store for the result heap; on return its first
+    //                    *n_results entries are the best matches, nearest first
+    //   n_results      - out: number of results, at most N_RESULTS
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops the nearest vertex first (GT); results keeps the
+        // farthest kept result on top (LT) so that it is the one evicted
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, EF);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            auto best = candidates.pop();
+            int   v_best = std::get<1>(best);
+            float d_best = std::get<0>(best);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the nearest open candidate is worse than every kept result
+            if (d_best > std::get<0>(results.top()))
+                break;
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so
+            // its list ends at E (the test must use v_best, not the start vertex)
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (seen.in(dst))
+                    continue;
+                // mark as seen
+                seen.insert(dst);
+                float d_neib = distance(q, &database[dst*VSIZE]);
+                float d_worst = std::get<0>(results.top());
+                // if there's room for new result or this distance is promising
+                if ((results.size() < EF) || (d_neib < d_worst)) {
+                    candidates.push({d_neib, dst});
+                    // evict the worst result before inserting so that results
+                    // never holds more than EF entries (same final contents)
+                    if (results.size() >= EF)
+                        results.pop();
+                    results.push({d_neib, dst});
+                }
+            }
+        }
+
+        // a heap's backing array is not sorted, so pick the n_res nearest of all held
+        // entries (NOTE(review): assumes DynHeap stores them at results_mem[0..size))
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp
new file mode 100644
index 000000000..01f62555f
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam search kernel (v4): beam search over graph Gs[G_0] on a 1x1 tile group, scored with inner_product_v3.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // not referenced in this file -- presumably the database vertex count; TODO confirm
+#define VSIZE 100 // floats per vector: size of the local query copy and stride into database
+#define NG 4 // not referenced in this file -- presumably the number of graphs in Gs; TODO confirm
+#define V_ENTRY 82026 // not referenced in this file -- presumably the search entry vertex; TODO confirm
+
+#define EF        128 // cap on the number of entries kept in the beam-search results heap
+#define N_RESULTS 10 // upper bound on the count written to *n_results
+
+#define G_0 3 // G_x: index into the Gs array; beam search runs on Gs[G_0],
+#define G_1 2 // and input_test visits the graphs in G_0, G_1, G_2, G_3 order
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index into neighbors where v's adjacency list starts
+    const int *neighbors; // concatenated adjacency lists; entries are vertex ids
+    int V;                // vertex count; V-1 is treated as the last vertex
+    int E;                // length of neighbors; used as the end of the last vertex's list
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Argument smoke test: copies each graph descriptor out of Gs (in G_0..G_3 order)
+    // and, when DEBUG_INPUT_TEST is defined, prints every input. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph g;
+        for (int j = 0; j < 4; ++j) {
+            memcpy(&g, &Gs[order[j]], sizeof(g));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", j, g.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, g.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, g.V);
+            bsg_printf("G[%d].E = %d\n", j, g.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Search "distance" is the negated inner product, so a smaller value means a better match.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over graph Gs[G_0], seeded with the greedy-walk result; always returns 0.
+    //   Gs             - graph descriptors; only Gs[G_0] is read
+    //   database       - vectors of VSIZE floats each, indexed by vertex id
+    //   query          - query vector of VSIZE floats
+    //   seen_mem       - backing store for the visited-vertex set
+    //   v_curr_o/d_curr_o - in: start vertex and its distance to the query
+    //   candidates_mem - backing store for the frontier heap
+    //   results_mem    - backing store for the result heap; on return its first
+    //                    *n_results entries are the best matches, nearest first
+    //   n_results      - out: number of results, at most N_RESULTS
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops the nearest vertex first (GT); results keeps the
+        // farthest kept result on top (LT) so that it is the one evicted
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, EF);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            auto best = candidates.pop();
+            int   v_best = std::get<1>(best);
+            float d_best = std::get<0>(best);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the nearest open candidate is worse than every kept result
+            if (d_best > std::get<0>(results.top()))
+                break;
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so
+            // its list ends at E (the test must use v_best, not the start vertex)
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (seen.in(dst))
+                    continue;
+                // mark as seen
+                seen.insert(dst);
+                float d_neib = distance(q, &database[dst*VSIZE]);
+                float d_worst = std::get<0>(results.top());
+                // if there's room for new result or this distance is promising
+                if ((results.size() < EF) || (d_neib < d_worst)) {
+                    candidates.push({d_neib, dst});
+                    // evict the worst result before inserting so that results
+                    // never holds more than EF entries (same final contents)
+                    if (results.size() >= EF)
+                        results.pop();
+                    results.push({d_neib, dst});
+                }
+            }
+        }
+
+        // a heap's backing array is not sorted, so pick the n_res nearest of all held
+        // entries (NOTE(review): assumes DynHeap stores them at results_mem[0..size))
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
new file mode 100644
index 000000000..18a29fd33
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * IPNSW beam search kernel (v5): beam search over graph Gs[G_0] on a 1x1 tile group, using inner_product_v3 and DenseSet_v1.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000 // not referenced in this file -- presumably the database vertex count; TODO confirm
+#define VSIZE 100 // floats per vector: size of the local query copy and stride into database
+#define NG 4 // not referenced in this file -- presumably the number of graphs in Gs; TODO confirm
+#define V_ENTRY 82026 // not referenced in this file -- presumably the search entry vertex; TODO confirm
+
+#define EF        128 // cap on the number of entries kept in the beam-search results heap
+#define N_RESULTS 10 // upper bound on the count written to *n_results
+
+#define G_0 3 // G_x: index into the Gs array; beam search runs on Gs[G_0],
+#define G_1 2 // and input_test visits the graphs in G_0, G_1, G_2, G_3 order
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v]: index into neighbors where v's adjacency list starts
+    const int *neighbors; // concatenated adjacency lists; entries are vertex ids
+    int V;                // vertex count; V-1 is treated as the last vertex
+    int E;                // length of neighbors; used as the end of the last vertex's list
+};
+
+class LT { // orders (distance, vertex) pairs by ascending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first < rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+class GT { // orders (distance, vertex) pairs by descending distance; the vertex id is ignored
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return lhs.first > rhs.first; // const-qualified so a const comparator can be invoked
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Argument smoke test: copies each graph descriptor out of Gs (in G_0..G_3 order)
+    // and, when DEBUG_INPUT_TEST is defined, prints every input. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph g;
+        for (int j = 0; j < 4; ++j) {
+            memcpy(&g, &Gs[order[j]], sizeof(g));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", j, g.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, g.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, g.V);
+            bsg_printf("G[%d].E = %d\n", j, g.E);
+#endif // DEBUG_INPUT_TEST
+        }
+
+        return 0;
+    }
+
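+// Traversal tracing is ON in this variant; comment out the define to disable it. NOTE(review): confirm this is intended -- the prints run inside the stat window.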
+#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+// Search "distance" is the negated inner product, so a smaller value means a better match.
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+
+    // Beam search over graph Gs[G_0], seeded with the greedy-walk result; always returns 0.
+    //   Gs             - graph descriptors; only Gs[G_0] is read
+    //   database       - vectors of VSIZE floats each, indexed by vertex id
+    //   query          - query vector of VSIZE floats
+    //   seen_mem       - backing store for the visited-vertex set
+    //   v_curr_o/d_curr_o - in: start vertex and its distance to the query
+    //   candidates_mem - backing store for the frontier heap
+    //   results_mem    - backing store for the result heap; on return its first
+    //                    *n_results entries are the best matches, nearest first
+    //   n_results      - out: number of results, at most N_RESULTS
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // candidates pops the nearest vertex first (GT); results keeps the
+        // farthest kept result on top (LT) so that it is the one evicted
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, EF);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            auto best = candidates.pop();
+            int   v_best = std::get<1>(best);
+            float d_best = std::get<0>(best);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+            // stop once the nearest open candidate is worse than every kept result
+            if (d_best > std::get<0>(results.top()))
+                break;
+
+            // traverse neighbors of v_best; the last vertex has no offsets[v+1], so
+            // its list ends at E (the test must use v_best, not the start vertex)
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (seen.in(dst))
+                    continue;
+                // mark as seen
+                seen.insert(dst);
+                float d_neib = distance(q, &database[dst*VSIZE]);
+                float d_worst = std::get<0>(results.top());
+                // if there's room for new result or this distance is promising
+                if ((results.size() < EF) || (d_neib < d_worst)) {
+                    candidates.push({d_neib, dst});
+                    // evict the worst result before inserting so that results
+                    // never holds more than EF entries (same final contents)
+                    if (results.size() >= EF)
+                        results.pop();
+                    results.push({d_neib, dst});
+                }
+            }
+        }
+
+        // a heap's backing array is not sorted, so pick the n_res nearest of all held
+        // entries (NOTE(review): assumes DynHeap stores them at results_mem[0..size))
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::partial_sort(results_mem, results_mem+n_res, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
new file mode 100644
index 000000000..9c761e94d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp
@@ -0,0 +1,2 @@
+// No-op debug kernel. Returns 0 explicitly: flowing off the end of a non-void function is undefined behavior.
+extern "C" int empty() { return 0; }
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp
new file mode 100644
index 000000000..385e69d8a
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel: hill-climbs from a fixed entry vertex toward the query.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph's adjacency, as indexed by ipnsw_greedy_search below
+    const int *offsets;   // offsets[v]: position in neighbors[] where v's list starts
+    const int *neighbors; // all neighbor lists stored back to back
+    int V;                // vertex count; vertex V-1 is treated as the last one
+    int E;                // length of neighbors[]; closes the last vertex's list
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) // argument smoke test; always returns 0
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs); // dump the raw argument pointers
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the Gs[] entries are read: 3, 2, 1, 0
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets); // NOTE(review): label is loop position j, not the Gs[] index i
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    int ipnsw_greedy_search (const graph *Gs, const float *database, const float *query, int *seen, // seen is never read or written in this function
+                             int *v_curr_o, float *d_curr_o) // out: final vertex id and its distance; always returns 0
+    {
+        float q[VSIZE]; // local copy of the query vector
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY; // the walk starts at the hard-coded entry vertex
+        float d_curr = 0;
+
+        d_curr = distance(q, &database[v_curr*VSIZE]); // database is indexed as VSIZE floats per vertex
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) { // walks Gs[0] .. Gs[NG-2]; Gs[NG-1] is not visited here
+            struct graph G = Gs[i]; // local copy of this graph's descriptor
+            bool changed = true;
+            while (changed) { // stop once a full neighbor scan finds nothing closer
+                changed = false;
+                // fetch neighbors
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; // the last vertex's list ends at E
+                for (int dst_i = 0; dst_i < degree; dst_i++) { // dst_0/degree are fixed, so the scan finishes this list even if v_curr moves
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]); // distance() is the negated inner product
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) { // strictly smaller distance, i.e. larger inner product
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp
new file mode 100644
index 000000000..67533d6da
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (inner_product_v1 variant): hill-climbs from a fixed entry vertex toward the query.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph's adjacency, as indexed by ipnsw_greedy_search below
+    const int *offsets;   // offsets[v]: position in neighbors[] where v's list starts
+    const int *neighbors; // all neighbor lists stored back to back
+    int V;                // vertex count; vertex V-1 is treated as the last one
+    int E;                // length of neighbors[]; closes the last vertex's list
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) // argument smoke test; always returns 0
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs); // dump the raw argument pointers
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the Gs[] entries are read: 3, 2, 1, 0
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets); // NOTE(review): label is loop position j, not the Gs[] index i
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, // seen is never read or written in this function
+                             int *v_curr_o, float *d_curr_o) // out: final vertex id and its distance; always returns 0
+    {
+        float q[VSIZE]; // local copy of the query vector
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY; // the walk starts at the hard-coded entry vertex
+        float d_curr = 0;
+
+        d_curr = distance(q, &database[v_curr*VSIZE]); // database is indexed as VSIZE floats per vertex
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) { // walks Gs[0] .. Gs[NG-2]; Gs[NG-1] is not visited here
+            struct graph G = Gs[i]; // local copy of this graph's descriptor
+            bool changed = true;
+            while (changed) { // stop once a full neighbor scan finds nothing closer
+                changed = false;
+                // fetch neighbors
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; // the last vertex's list ends at E
+                for (int dst_i = 0; dst_i < degree; dst_i++) { // dst_0/degree are fixed, so the scan finishes this list even if v_curr moves
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]); // distance() is the negated inner_product_v1
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) { // strictly smaller distance, i.e. larger inner product
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp
new file mode 100644
index 000000000..d7c2bd9c3
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (inner_product_v2 variant): hill-climbs from a fixed entry vertex toward the query.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph's adjacency, as indexed by ipnsw_greedy_search below
+    const int *offsets;   // offsets[v]: position in neighbors[] where v's list starts
+    const int *neighbors; // all neighbor lists stored back to back
+    int V;                // vertex count; vertex V-1 is treated as the last one
+    int E;                // length of neighbors[]; closes the last vertex's list
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) // argument smoke test; always returns 0
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs); // dump the raw argument pointers
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the Gs[] entries are read: 3, 2, 1, 0
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets); // NOTE(review): label is loop position j, not the Gs[] index i
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, // seen is never read or written in this function
+                             int *v_curr_o, float *d_curr_o) // out: final vertex id and its distance; always returns 0
+    {
+        float q[VSIZE]; // local copy of the query vector
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY; // the walk starts at the hard-coded entry vertex
+        float d_curr = 0;
+
+        d_curr = distance(q, &database[v_curr*VSIZE]); // database is indexed as VSIZE floats per vertex
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) { // walks Gs[0] .. Gs[NG-2]; Gs[NG-1] is not visited here
+            struct graph G = Gs[i]; // local copy of this graph's descriptor
+            bool changed = true;
+            while (changed) { // stop once a full neighbor scan finds nothing closer
+                changed = false;
+                // fetch neighbors
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; // the last vertex's list ends at E
+                for (int dst_i = 0; dst_i < degree; dst_i++) { // dst_0/degree are fixed, so the scan finishes this list even if v_curr moves
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]); // distance() is the negated inner_product_v2
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) { // strictly smaller distance, i.e. larger inner product
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
new file mode 100644
index 000000000..ddea465b0
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel (inner_product_v3 variant): hill-climbs from a fixed entry vertex toward the query.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // one graph's adjacency, as indexed by ipnsw_greedy_search below
+    const int *offsets;   // offsets[v]: position in neighbors[] where v's list starts
+    const int *neighbors; // all neighbor lists stored back to back
+    int V;                // vertex count; vertex V-1 is treated as the last one
+    int E;                // length of neighbors[]; closes the last vertex's list
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen) // argument smoke test; always returns 0
+    {
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs); // dump the raw argument pointers
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the Gs[] entries are read: 3, 2, 1, 0
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets); // NOTE(review): label is loop position j, not the Gs[] index i
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
+
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, // seen is never read or written in this function
+                             int *v_curr_o, float *d_curr_o) // out: final vertex id and its distance; always returns 0
+    {
+        float q[VSIZE]; // local copy of the query vector
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY; // the walk starts at the hard-coded entry vertex
+        float d_curr = 0;
+
+        d_curr = distance(q, &database[v_curr*VSIZE]); // database is indexed as VSIZE floats per vertex
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) { // walks Gs[0] .. Gs[NG-2]; Gs[NG-1] is not visited here
+            struct graph G = Gs[i]; // local copy of this graph's descriptor
+            bool changed = true;
+            while (changed) { // stop once a full neighbor scan finds nothing closer
+                changed = false;
+                // fetch neighbors
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; // the last vertex's list ends at E
+                for (int dst_i = 0; dst_i < degree; dst_i++) { // dst_0/degree are fixed, so the scan finishes this list even if v_curr moves
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]); // distance() is the negated inner_product_v3
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) { // strictly smaller distance, i.e. larger inner product
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp
new file mode 100644
index 000000000..aaaf5317d
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp
@@ -0,0 +1,40 @@
+#pragma once
+#include <array>
+#include <algorithm>
+
+template <typename T, typename Comparitor>
+class DynHeap { // bounded binary heap over caller-provided storage, ordered by Comparitor
+public:
+    DynHeap(T *data, int N): // initializers listed in member declaration order (avoids -Wreorder)
+        _n(0),
+        _data_N(N),
+        _data(data){
+    }
+
+    void push(T i) { // NOTE: stores into _data[_n] before the capacity check, so data needs N+1 slots
+        _data[_n++] = i;
+        std::push_heap(_data, _data+_n, Comparitor());
+        if (_n > _data_N) pop(); // over capacity: evict the current top
+    }
+
+    T pop() { // removes and returns the top; no emptiness check
+        std::pop_heap(_data, _data+_n--, Comparitor()); // range uses the old _n; _n drops by one afterwards
+        return _data[_n];
+    }
+
+    T top() const { // element at the heap root; no emptiness check
+        return _data[0];
+    }
+
+    bool empty() const {
+        return _n == 0;
+    }
+
+    int size() const {
+        return _n;
+    }
+
+    int _n;      // number of elements currently stored
+    int _data_N; // maximum number of elements kept after a push
+    T  *_data;   // storage supplied to the constructor
+};
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp
new file mode 100644
index 000000000..95d6b291e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp
@@ -0,0 +1,6 @@
+#ifndef __HELLO_WORLD_HPP
+#define __HELLO_WORLD_HPP
+
+#include <cstdint>
+
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
new file mode 100644
index 000000000..8bb83077a
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
@@ -0,0 +1,89 @@
+#pragma once
+#include "bsg_striped_array.hpp"
+#include <cmath>
+#include <numeric>
+
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10> // TG_X*TG_Y scales the block stride
+__attribute__((noinline))
+FLOAT_T inner_product(const FLOAT_T *__restrict a, const FLOAT_T *__restrict b) // sum of a[k]*b[k] over the blocks this call visits
+{
+    FLOAT_T r = 0.0;
+    for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { // first block chosen by __bsg_id (presumably the tile's id — confirm)
+        #pragma GCC unroll 32
+        for (int j = 0; j < BSIZE; ++j) { // NOTE(review): no clamp against VSIZE; assumes VSIZE is a multiple of BSIZE
+            r += a[i + j]*b[i + j];
+        }
+    }
+    return r;
+}
+
+
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10> // TG_X*TG_Y scales the block stride
+__attribute__((noinline))
+FLOAT_T inner_product_v1(const FLOAT_T *__restrict a, // same loop as inner_product; only b's qualifier differs
+                         bsg_attr_remote const FLOAT_T *__restrict b) // bsg_attr_remote is defined outside this header
+{
+    FLOAT_T r = 0.0;
+    for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { // first block chosen by __bsg_id (presumably the tile's id — confirm)
+        #pragma GCC unroll 32
+        for (int j = 0; j < BSIZE; ++j) { // NOTE(review): no clamp against VSIZE; assumes VSIZE is a multiple of BSIZE
+            r += a[i + j]*b[i + j];
+        }
+    }
+    return r;
+}
+
+
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10> // TG_X*TG_Y scales the block stride
+__attribute__((noinline))
+FLOAT_T inner_product_v2(const FLOAT_T *__restrict a, // v1 loop with the multiply-add done through fmaf
+                         bsg_attr_remote const FLOAT_T *__restrict b) // bsg_attr_remote is defined outside this header
+{
+    FLOAT_T r = 0.0;
+    for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { // first block chosen by __bsg_id (presumably the tile's id — confirm)
+        #pragma GCC unroll 32
+        for (int j = 0; j < BSIZE; ++j) { // NOTE(review): no clamp against VSIZE; assumes VSIZE is a multiple of BSIZE
+            r = fmaf(a[i+j], b[i+j], r); // fmaf is the float overload; a non-float FLOAT_T would be converted
+        }
+    }
+    return r;
+}
+
+
+
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10> // TG_X*TG_Y scales the block stride
+__attribute__((noinline))
+FLOAT_T inner_product_v3(const FLOAT_T *__restrict a, // two independent accumulators, two BSIZE-wide blocks per pass
+                         bsg_attr_remote const FLOAT_T *__restrict b) // bsg_attr_remote is defined outside this header
+{
+    FLOAT_T r0 = 0.0, r1 = 0.0;
+    for (int i = __bsg_id * BSIZE; i < VSIZE; i += 2 * BSIZE * TG_X * TG_Y) { // NOTE(review): each pass reads 2*BSIZE elements but the start is __bsg_id*BSIZE, so nonzero ids would overlap — confirm
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) { // NOTE(review): no clamp against VSIZE; assumes VSIZE is a multiple of 2*BSIZE
+            r0 = fmaf(a[i+j+0*BSIZE], b[i+j+0*BSIZE], r0); // first block of the pair
+            r1 = fmaf(a[i+j+1*BSIZE], b[i+j+1*BSIZE], r1); // second block of the pair
+        }
+    }
+    return r0+r1;
+}
+
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5> // dot product with UNROLL independent accumulators
+__attribute__((noinline))
+FLOAT_T inner_product_v4(const FLOAT_T *__restrict a,
+                         bsg_attr_remote const FLOAT_T *__restrict b) // bsg_attr_remote is defined outside this header
+{
+    FLOAT_T r[UNROLL] = {}; // accumulators must start at zero; `register` dropped (removed in C++17)
+    for (int i = __bsg_id * BSIZE; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { // each pass consumes UNROLL*BSIZE elements; NOTE(review): nonzero __bsg_id would overlap — confirm
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); // accumulator k handles the k-th BSIZE-wide sub-block
+            }
+        }
+    }
+    FLOAT_T rs = 0.0; // reduce in FLOAT_T; an int accumulator truncated every partial sum
+    for (int i = 0; i < UNROLL; ++i)
+        rs += r[i];
+    return rs;
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
new file mode 100644
index 000000000..2ef683838
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
@@ -0,0 +1,73 @@
+#pragma once
+#include <algorithm>
+#include <atomic>
+template<typename T, typename Comparitor>
+class DynSet { // sorted-array set over caller-provided storage; duplicates are not filtered
+public:
+    DynSet(T *data, int N): // initializers listed in member declaration order (avoids -Wreorder)
+        _data(data),
+        _n(0),
+        _data_N(N) {
+    }
+
+    void insert(T i) { // appends then re-sorts the whole array; _data_N is not checked
+        _data[_n++] = i;
+        std::sort(_data, _data+_n, Comparitor());
+    }
+
+    bool in(T i) { // binary search; relies on insert() keeping the array sorted
+        return std::binary_search(_data, _data+_n, i, Comparitor());
+    }
+
+    int size() const { // number of stored entries, duplicates included
+        return _n;
+    }
+
+    T    *_data;   // storage supplied to the constructor
+    int   _n;      // number of entries currently stored
+    int   _data_N; // capacity passed by the caller (stored, never consulted)
+};
+
+template<typename T>
+class DenseSet { // membership flags, one int slot per element value
+public:
+    DenseSet(int *data): // wraps caller-provided storage; nothing is cleared here
+        _data(data) {
+    }
+
+    void insert(T i) { // i is used directly as the array index; no bounds check
+        _data[i] = 1;
+    }
+
+    bool in(T i) { // true only when the slot holds exactly 1
+        return _data[i] == 1;
+    }
+
+    int *_data; // flag array supplied to the constructor
+};
+
+template<typename T>
+class DenseSet_v1 { // bit-packed membership flags: 32 elements per int word
+public:
+    DenseSet_v1(int *data) : // wraps caller-provided storage; nothing is cleared here
+        _data(data){
+    }
+
+    void insert(T i) { // plain read-modify-write of the word (not atomic); no bounds check
+        _data[word(i)] |= (1 << bit(i));
+    }
+
+    bool in(T i) { // nonzero masked word converts to true
+        return _data[word(i)] & (1 << bit(i));
+    }
+
+    int word(T i) const { // index of the int holding i's flag (i / 32 for non-negative i)
+        return  i >> 5;
+    }
+
+    int bit(T i) const { // bit position of i's flag within its word (0..31)
+        return i & 31;
+    }
+    int *_data; // word array supplied to the constructor
+};
+
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp
new file mode 100644
index 000000000..fc8dd7c82
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp
@@ -0,0 +1,71 @@
+/*
+ * Inner-product microbenchmark: sums N inner products of the query against database vectors.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // not referenced by the kernel in this file
+    const int *offsets;   // presumably per-vertex start positions in neighbors[] — confirm
+    const int *neighbors; // presumably the concatenated neighbor lists — confirm
+    int V;                // presumably the vertex count — confirm
+    int E;                // presumably the length of neighbors[] — confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    int inner_product_ubmk(const float *database, const float *query, int N) // returns the accumulated sum truncated to int
+    {
+        float q[VSIZE]; // local copy of the query vector
+        float r = 0; // running sum of all N inner products
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+        // N inner products of q against database rows 0, 3, 6, ... (stride of 3*VSIZE floats)
+        for (int i = 0; i < N; ++i) {
+            const float *b = &database[i*3*VSIZE];
+            r += inner_product<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(q,b);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp
new file mode 100644
index 000000000..2deb68437
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark (inner_product_v1): sums N inner products of the query against database vectors.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // not referenced by the kernel in this file
+    const int *offsets;   // presumably per-vertex start positions in neighbors[] — confirm
+    const int *neighbors; // presumably the concatenated neighbor lists — confirm
+    int V;                // presumably the vertex count — confirm
+    int E;                // presumably the length of neighbors[] — confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+    
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database, // returns the accumulated sum truncated to int
+                           const float * __restrict query,
+                           int N)
+    {
+        float q[VSIZE]; // local copy of the query vector
+        float r = 0; // running sum of all N inner products
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+        // N inner products of q against database rows 0, 3, 6, ... (stride of 3*VSIZE floats)
+        for (int i = 0; i < N; ++i) {
+            //const float *b = &database[i*3*VSIZE];
+            r += iproduct(q, &database[i*3*VSIZE]); // iproduct expands to inner_product_v1
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp
new file mode 100644
index 000000000..0d4fce43b
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark (inner_product_v2): sums N inner products of the query against database vectors.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // not referenced by the kernel in this file
+    const int *offsets;   // presumably per-vertex start positions in neighbors[] — confirm
+    const int *neighbors; // presumably the concatenated neighbor lists — confirm
+    int V;                // presumably the vertex count — confirm
+    int E;                // presumably the length of neighbors[] — confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v2<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database, // returns the accumulated sum truncated to int
+                           const float * __restrict query,
+                           int N)
+    {
+        float q[VSIZE]; // local copy of the query vector
+        float r = 0; // running sum of all N inner products
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+        // N inner products of q against database rows 0, 3, 6, ... (stride of 3*VSIZE floats)
+        for (int i = 0; i < N; ++i) {
+            //const float *b = &database[i*3*VSIZE];
+            r += iproduct(q, &database[i*3*VSIZE]); // iproduct expands to inner_product_v2
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp
new file mode 100644
index 000000000..8f1058017
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark (inner_product_v3): sums N inner products of the query against database vectors.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // not referenced by the kernel in this file
+    const int *offsets;   // presumably per-vertex start positions in neighbors[] — confirm
+    const int *neighbors; // presumably the concatenated neighbor lists — confirm
+    int V;                // presumably the vertex count — confirm
+    int E;                // presumably the length of neighbors[] — confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database, // returns the accumulated sum truncated to int
+                           const float * __restrict query,
+                           int N)
+    {
+        float q[VSIZE]; // local copy of the query vector
+        float r = 0; // running sum of all N inner products
+
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0); // presumably opens the stats region tagged 0 — confirm against bsg_manycore
+        // N inner products of q against database rows 0, 3, 6, ... (stride of 3*VSIZE floats)
+        for (int i = 0; i < N; ++i) {
+            //const float *b = &database[i*3*VSIZE];
+            r += iproduct(q, &database[i*3*VSIZE]); // iproduct expands to inner_product_v3
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp
new file mode 100644
index 000000000..c1ab7a9ba
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Inner-product microbenchmark kernel (uses the inner_product_v4 variant)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // graph descriptor; not referenced elsewhere in this kernel file
+    const int *offsets;   // presumably per-vertex start index into neighbors — TODO confirm
+    const int *neighbors; // presumably the concatenated neighbor lists — TODO confirm
+    int V;                // presumably the number of vertices — TODO confirm
+    int E;                // presumably the number of edges — TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v4<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+    // Microbenchmark entry point: sums N inner products of the query with database vectors and returns the sum truncated to int.
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database, // indexed in units of VSIZE floats
+                           const float * __restrict query,                    // VSIZE floats are copied out of this
+                           int N)                                             // number of inner products to run
+    {
+        float q[VSIZE];
+        float r = 0;
+        // Print N, then copy the query into the local array q before the measured loop.
+        bsg_print_int(N);
+        memcpy(q, query, sizeof(q));
+        // Only the loop sits between the bsg_cuda_print_stat_start(0)/_end(0) calls.
+        bsg_cuda_print_stat_start(0);
+        // Not random: iteration i reads the vector at float offset i*3*VSIZE (a fixed stride of 3*VSIZE floats).
+        for (int i = 0; i < N; ++i) {
+            //const float *b = &database[i*3*VSIZE];
+            r += iproduct(q, &database[i*3*VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(r);
+    }
+#ifdef __cplusplus
+}
+#endif

From 0723d9070199aae2b6b9d4d52a9869ae190c692a Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 15:31:30 -0700
Subject: [PATCH 02/22] [ipnsw] Older versions

---
 .../ipnsw/BeamSearchKernelRunner.hpp          |  42 ++-
 .../ipnsw/BeamSearchResultReader.hpp          |   5 +-
 .../ipnsw/GreedyWalkKernelRunner.hpp          |  24 +-
 .../ipnsw/GreedyWalkResultReader.hpp          |   4 +-
 .../sdh-eval-workloads/ipnsw/GroupData.hpp    |  10 +
 examples/sdh-eval-workloads/ipnsw/IO.hpp      |  32 ++
 .../ipnsw/IPNSWKernelRunner.hpp               |  15 +-
 .../sdh-eval-workloads/ipnsw/IPNSWRunner.hpp  | 231 ++++++++++++---
 .../ipnsw/IProductUBmkFactory.hpp             |   6 +-
 .../ipnsw/IProductUBmkKernelRunner.hpp        |   9 +-
 .../ipnsw/IProductUBmkParallelFactory.hpp     |  20 ++
 .../IProductUBmkParallelKernelRunner.hpp      |  64 ++++
 examples/sdh-eval-workloads/ipnsw/Makefile    | 120 +++-----
 examples/sdh-eval-workloads/ipnsw/ipnsw.cpp   |  50 ++--
 examples/sdh-eval-workloads/ipnsw/ipnsw.hpp   |   1 -
 .../ipnsw/kernel/beam_search_v1/kernel.cpp    |   2 +-
 .../ipnsw/kernel/beam_search_v10/kernel.cpp   | 279 ++++++++++++++++++
 .../beam_search_v5-ipv4serial/kernel.cpp      | 192 ++++++++++++
 .../ipnsw/kernel/beam_search_v5/kernel.cpp    |  11 +-
 .../ipnsw/kernel/beam_search_v6/kernel.cpp    | 195 ++++++++++++
 .../ipnsw/kernel/beam_search_v7/kernel.cpp    | 194 ++++++++++++
 .../ipnsw/kernel/beam_search_v8/kernel.cpp    | 270 +++++++++++++++++
 .../ipnsw/kernel/beam_search_v9/kernel.cpp    | 249 ++++++++++++++++
 .../greedy_walk_v3-ipv4serial/kernel.cpp      | 147 +++++++++
 .../ipnsw/kernel/greedy_walk_v3/kernel.cpp    |   2 +-
 .../ipnsw/kernel/greedy_walk_v4/kernel.cpp    | 152 ++++++++++
 .../kernel/greedy_walk_v4/kernel.loc.cpp      | 113 +++++++
 .../ipnsw/kernel/greedy_walk_v4/loc.sh        |   1 +
 .../ipnsw/kernel/include/inner_product.hpp    | 236 ++++++++++++++-
 .../ipnsw/kernel/include/set.hpp              |  15 +-
 .../kernel/include/sleep_until_valid.hpp      |  28 ++
 .../iproduct_ubmk-parallel_v1/kernel.cpp      | 180 +++++++++++
 .../iproduct_ubmk-parallel_v2/kernel.cpp      | 154 ++++++++++
 .../iproduct_ubmk-parallel_v3/kernel.cpp      |  80 +++++
 .../kernel/iproduct_ubmk_parallel/kernel.cpp  |  94 ++++++
 35 files changed, 3030 insertions(+), 197 deletions(-)
 create mode 100644 examples/sdh-eval-workloads/ipnsw/GroupData.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp
 create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp

diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
index 426042f6d..6fe724b68 100644
--- a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp
@@ -9,30 +9,44 @@ namespace ipnsw {
             return "ipnsw_beam_search";
         }
 
+        // Group dimensions (tgd) taken from the runner's configuration.
+        Dim tgd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grp_x(), runner.cfg().grp_y());
+        }
+
+        // Grid dimensions (gd) taken from the runner's configuration.
+        Dim gd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grid_x(), runner.cfg().grid_y());
+        }
         std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
             int v_curr;
             float d_curr;
-            v_curr = std::get<GWR_VERT>(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]);
-            d_curr = std::get<GWR_DIST>(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]);
+            std::vector<int> do_queries = runner._io->do_queries();
+            if (do_queries.empty()) {
+                v_curr = std::get<GWR_VERT>(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]);
+                d_curr = std::get<GWR_DIST>(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]);
+            } else {
+                v_curr = std::get<GWR_VERT>(GREEDY_WALK_RESULTS[do_queries[0]]);
+                d_curr = std::get<GWR_DIST>(GREEDY_WALK_RESULTS[do_queries[0]]);
+            }
 
             HammerBlade::Ptr hb = HammerBlade::Get();
-            hb->write(runner.v_curr_dev(), &v_curr, sizeof(v_curr));
-            hb->write(runner.d_curr_dev(), &d_curr, sizeof(d_curr));
+            hb->write(runner.v_curr_dev(0), &v_curr, sizeof(v_curr));
+            hb->write(runner.d_curr_dev(0), &d_curr, sizeof(d_curr));
 
             std::vector<hb_mc_eva_t> argv = {
                 runner.graph_metadata_dev(),
                 runner.db_dev(),
-                runner.query_dev(),
-                runner.seen_dev(),
-                runner.v_curr_dev(),
-                runner.d_curr_dev(),
-                runner.candidates_dev(),
-                runner.results_dev(),
-                runner.n_results_dev(),
+                runner.query_dev(0),
+                runner.seen_dev(0),
+                runner.v_curr_dev(0),
+                runner.d_curr_dev(0),
+                runner.candidates_dev(0),
+                runner.results_dev(0),
+                runner.n_results_dev(0),
             };
             return argv;
-        };
-        Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);}
-        Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);}
+        }
+
     };
 }
diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
index ce77d324f..3d4cc7493 100644
--- a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp
@@ -9,11 +9,12 @@ namespace ipnsw {
         void readResults(const IPNSWRunner & runner) {
             HammerBlade::Ptr hb = HammerBlade::Get();
 
+            hb_mc_eva_t grp = 0;
             int n_results;
-            hb->read(runner.n_results_dev(), &n_results, sizeof(int));
+            hb->read(runner.n_results_dev(grp), &n_results, sizeof(int));
 
             std::vector<GreedyWalkResult> results(n_results);
-            hb->push_read(runner.results_dev(), &results[0], n_results * sizeof(GreedyWalkResult));
+            hb->push_read(runner.results_dev(grp), &results[0], n_results * sizeof(GreedyWalkResult));
             hb->sync_read();
 
             std::cout << "Beam search:" << std::endl;
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
index 72eea9f0f..ac51739b4 100644
--- a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp
@@ -4,6 +4,17 @@
 
 namespace ipnsw {
     class GreedyWalkKernelRunner : public IPNSWKernelRunner {
+
+        // Group dimensions (tgd) taken from the runner's configuration.
+        Dim tgd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grp_x(), runner.cfg().grp_y());
+        }
+
+        // Grid dimensions (gd) taken from the runner's configuration.
+        Dim gd(const IPNSWRunner & runner) const {
+            return Dim(runner.cfg().grid_x(), runner.cfg().grid_y());
+        }
+
         std::string kernelName(const IPNSWRunner & runner) const {
             return "ipnsw_greedy_search";
         }
@@ -12,14 +23,13 @@ namespace ipnsw {
             std::vector<hb_mc_eva_t> argv = {
                 runner.graph_metadata_dev(),
                 runner.db_dev(),
-                runner.query_dev(),
-                runner.seen_dev(),
-                runner.v_curr_dev(),
-                runner.d_curr_dev(),
+                runner.query_dev(0),
+                runner.seen_dev(0),
+                runner.v_curr_dev(0),
+                runner.d_curr_dev(0),
             };
             return argv;
-        };
-        Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);}
-        Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);}
+        }
+
     };
 }
diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
index ae57cd548..6ca7851ff 100644
--- a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp
@@ -10,8 +10,8 @@ namespace ipnsw {
             int v_curr;
             float d_curr;
 
-            hb->read(runner.v_curr_dev(), &v_curr, sizeof(int));
-            hb->read(runner.d_curr_dev(), &d_curr, sizeof(float));
+            hb->read(runner.v_curr_dev(0), &v_curr, sizeof(int));
+            hb->read(runner.d_curr_dev(0), &d_curr, sizeof(float));
 
             std::cout << "Greedy walk (v_curr,d_curr) = "
                       << "(" << v_curr << "," << d_curr << ")"
diff --git a/examples/sdh-eval-workloads/ipnsw/GroupData.hpp b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp
new file mode 100644
index 000000000..b9052ab23
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp
@@ -0,0 +1,10 @@
+#include <bsg_manycore_cuda.h>
+namespace ipnsw {
+    struct GroupData { // per-group device addresses; IPNSWRunner::initializeGroupData() writes one per group
+        hb_mc_eva_t seen_mem;       // set from IPNSWRunner::seen_dev(grp)
+        hb_mc_eva_t candidates_mem; // set from IPNSWRunner::candidates_dev(grp)
+        hb_mc_eva_t results_mem;    // set from IPNSWRunner::results_dev(grp)
+        hb_mc_eva_t curr;           // set from IPNSWRunner::curr_dev(grp)
+        hb_mc_eva_t n_results;      // set from IPNSWRunner::n_results_dev(grp)
+    };
+}; // NOTE(review): this header has no #pragma once / include guard
diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp
index 7dd4ef05e..52f0bad5b 100644
--- a/examples/sdh-eval-workloads/ipnsw/IO.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp
@@ -136,6 +136,38 @@ namespace ipnsw {
             return n;
         }
 
+        // Value of "--grid-x" parsed as int; 1 when option() yields an empty string.
+        int grid_x() const {
+            auto value = option("--grid-x");
+            if (value.empty())
+                return 1;
+            return from_string<int>(value);
+        }
+
+        // Value of "--grid-y" parsed as int; 1 when option() yields an empty string.
+        int grid_y() const {
+            auto value = option("--grid-y");
+            if (value.empty())
+                return 1;
+            return from_string<int>(value);
+        }
+
+        // Value of "--group-x" parsed as int; 1 when option() yields an empty string.
+        int grp_x() const {
+            auto value = option("--group-x");
+            if (value.empty())
+                return 1;
+            return from_string<int>(value);
+        }
+
+        // Value of "--group-y" parsed as int; 1 when option() yields an empty string.
+        int grp_y() const {
+            auto value = option("--group-y");
+            if (value.empty())
+                return 1;
+            return from_string<int>(value);
+        }
+
         std::string ucode() const   { return _ucode; }
         std::string version() const { return _version; }
         std::string exe() const     { return _exe; }
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
index e6042acaa..1604cb93e 100644
--- a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp
@@ -9,15 +9,24 @@ namespace ipnsw {
     public:
         using HammerBlade = hammerblade::host::HammerBlade;
         using Dim = hammerblade::host::Dim;
-        IPNSWKernelRunner() {}
+        IPNSWKernelRunner(){}
 
     protected:
         virtual std::string kernelName(const IPNSWRunner & runner) const =0;
         virtual std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const =0;
-        virtual Dim gd(const IPNSWRunner &runner) const = 0;
-        virtual Dim tgd(const IPNSWRunner &runner) const = 0;
 
     public:
+        virtual Dim gd(const IPNSWRunner &runner) const {
+            return Dim(1,1);
+        }
+        virtual Dim tgd(const IPNSWRunner &runner) const {
+            return Dim(1,1);
+        }
+
+    public:
+        virtual void beforeLaunchKernel(const IPNSWRunner &runner) { }
+        virtual void afterLaunchKernel(const IPNSWRunner &runner)  { }
+        
         void runKernel(IPNSWRunner &runner) {
             HammerBlade::Ptr hb = HammerBlade::Get();
             hb->push_jobv(gd(runner),
diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
index 3dbca5bec..feebf121d 100644
--- a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp
@@ -6,10 +6,59 @@
 #include "IPNSWKernelRunner.hpp"
 #include "IPNSWResultReader.hpp"
 #include "GreedyWalkResults.hpp"
+#include "GroupData.hpp"
 #include <memory>
 
 namespace ipnsw {
 
+    // Settings for an IPNSWRunner: seen-set kind plus grid and group dimensions.
+    class IPNSWRunnerConfig {
+    public:
+        typedef enum {
+            Dense,
+            BitVector,
+            Sparse,
+        } SetType;
+
+        // Defaults: BitVector seen set, 1x1 grid, 1x1 group.
+        IPNSWRunnerConfig():
+            _set_type(BitVector),
+            _grid_x(1), _grid_y(1),
+            _grp_x(1), _grp_y(1) {
+        }
+
+        SetType set_type() const { return _set_type; }
+        SetType & set_type() { return _set_type; }
+
+        // Human-readable name of the configured set type.
+        std::string set_type_str() const {
+            switch (set_type()) {
+            case Dense:     return "Dense";
+            case BitVector: return "Dense Bit Vector";
+            case Sparse:    return "Sparse";
+            }
+            // Not reached for valid SetType values; avoids falling off the end (UB).
+            return "Unknown";
+        }
+
+        int & grid_x()       { return _grid_x; }
+        int   grid_x() const { return _grid_x; }
+        int & grid_y()       { return _grid_y; }
+        int   grid_y() const { return _grid_y; }
+
+        int & grp_x()       { return _grp_x; }
+        int   grp_x() const { return _grp_x; }
+        int & grp_y()       { return _grp_y; }
+        int   grp_y() const { return _grp_y; }
+
+    private:
+        SetType _set_type;
+        int     _grid_x;
+        int     _grid_y;
+        int     _grp_x;
+        int     _grp_y;
+    };
+
     class IPNSWRunner {
     public:
         //static constexpr int QUERY = 276; // fewest dot products for greedy walk
@@ -24,12 +73,23 @@ namespace ipnsw {
         //static constexpr int QUERY = 461;
         //static constexpr int QUERY = 470;
 
+
+        static constexpr size_t CANDIDATES_MAX = 513;
+        static constexpr size_t RESULTS_MAX    = 129;
+
         using HammerBlade = hammerblade::host::HammerBlade;
         using Dim = hammerblade::host::Dim;
 
         IPNSWRunner(const Parser &p,
-                    std::unique_ptr<IPNSWFactory> & fact):
-            _factory(std::move(fact)) {
+                    std::unique_ptr<IPNSWFactory> & fact) :
+            IPNSWRunner(p, fact, IPNSWRunnerConfig()) {
+        }
+        
+        IPNSWRunner(const Parser &p,
+                    std::unique_ptr<IPNSWFactory> & fact,
+                    const IPNSWRunnerConfig &cfg):
+            _factory(std::move(fact)),
+            _cfg(cfg) {
             _io = std::unique_ptr<IO>(new IO(p));
             _hb = HammerBlade::Get();
             _kernel_runner = _factory->KernelRunner();
@@ -63,19 +123,41 @@ namespace ipnsw {
 
         void initializeDeviceMemoryQuery() {
             std::cout << "Initializing query "  << std::endl;
-            int query = QUERY;
 
-            auto do_queries = _io->do_queries();
-            if (!do_queries.empty())
-                query = do_queries[0];
+            std::vector<int> do_queries = _io->do_queries();
+            if (do_queries.empty()) {
+                do_queries = {QUERY};
+            }
+
+            _query_dev = _hb->alloc(sizeof(_queries[0]) * do_queries.size());
 
-            _query_dev = _hb->alloc(sizeof(_queries[query]));
-            _hb->push_write(_query_dev, &_queries[query], sizeof(_queries[query]));
+            for (hb_mc_eva_t qidx = 0; qidx < do_queries.size(); ++qidx) {
+                int query = do_queries[qidx];
+                _hb->push_write(_query_dev + qidx * sizeof(_queries[query]),
+                                &_queries[query],
+                                sizeof(_queries[query]));
+            }
         }
 
+        // Bytes of device memory for one group's seen set: one int per database
+        // entry (Dense/Sparse), or one bit per entry rounded up to whole ints of 32.
+        size_t seen_dev_size_per_group() const {
+            const size_t entries = _db.size();
+            switch (_cfg.set_type()) {
+            case IPNSWRunnerConfig::BitVector:
+                return ((entries + 31) / 32) * sizeof(int);
+            case IPNSWRunnerConfig::Dense:
+            case IPNSWRunnerConfig::Sparse:
+                break;
+            }
+            return entries * sizeof(int); // also reached for out-of-range values, so no path falls off the end
+        }
         void initializeDeviceMemorySeen() {
             std::cout << "Initializing seen set " << std::endl;
-            _seen_dev = _hb->alloc(_db.size() * sizeof(int));
+            for (int i = 0; i < numGroups(); ++i) {
+                hb_mc_eva_t dev = _hb->alloc(seen_dev_size_per_group());
+                _seen_dev.push_back(dev);
+            }
         }
 
         void initializeDeviceMemoryGraphs() {
@@ -85,23 +167,55 @@ namespace ipnsw {
             _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs);
         }
 
-        void initializeDeviceVCurr() {
-            _v_curr_dev = _hb->alloc(sizeof(int));
+        void initializeDeviceVCurrDCurr() {
+            _curr_dev = _hb->alloc(sizeof(GreedyWalkResult) * numGroups());
+            hb_mc_eva_t grp = 0;
+            std::cout << std::hex;
+            std::cout << "_curr_dev=" << std::hex << _curr_dev << std::endl;
+            std::cout << "  curr(" << std::dec << grp << ")=" << std::hex <<   curr_dev(grp) << std::endl;
+            std::cout << "v_curr(" << std::dec << grp << ")=" << std::hex << v_curr_dev(grp) << std::endl;
+            std::cout << "d_curr(" << std::dec << grp << ")=" << std::hex << d_curr_dev(grp) << std::endl;
+            std::cout << std::dec;
         }
-        void initializeDeviceDCurr() {
-            _d_curr_dev = _hb->alloc(sizeof(float));
+
+        size_t candidates_dev_size_per_group() const {
+            return sizeof(GreedyWalkResult) * CANDIDATES_MAX;
         }
 
         void initializeDeviceCandidateDev() {
-            _candidates_dev = _hb->alloc(sizeof(GreedyWalkResult)*513);
+            for (int i = 0; i < numGroups(); ++i) {
+                hb_mc_eva_t dev = _hb->alloc(candidates_dev_size_per_group());
+                _candidates_dev.push_back(dev);
+            }
+        }
+
+        size_t results_dev_size_per_group() const {
+            return sizeof(GreedyWalkResult) * RESULTS_MAX;
         }
 
         void initializeDeviceResultsDev() {
-            _results_dev = _hb->alloc(sizeof(GreedyWalkResult) * 129);
+            for (int i = 0; i < numGroups(); ++i) {
+                hb_mc_eva_t dev = _hb->alloc(results_dev_size_per_group());
+                _results_dev.push_back(dev);
+            }
         }
 
         void initializeDeviceNResultsDev() {
-            _n_results_dev = _hb->alloc(sizeof(int));
+            _n_results_dev = _hb->alloc(sizeof(int) * numGroups());
+        }
+
+        void initializeGroupData() {
+            _group_data_dev = _hb->alloc(sizeof(GroupData) * numGroups());
+            for (int i = 0; i < numGroups(); ++i) {
+                GroupData gd = {
+                    .seen_mem       = seen_dev(i),
+                    .candidates_mem = candidates_dev(i),
+                    .results_mem    = results_dev(i),
+                    .curr           = curr_dev(i),
+                    .n_results      = n_results_dev(i),
+                };
+                _hb->push_write(group_data_dev(i), &gd, sizeof(gd));
+            }
         }
 
         void initializeDeviceMemory() {
@@ -109,19 +223,21 @@ namespace ipnsw {
             initializeDeviceMemoryQuery();
             initializeDeviceMemorySeen();
             initializeDeviceMemoryGraphs();
-            initializeDeviceVCurr();
-            initializeDeviceDCurr();
+            initializeDeviceVCurrDCurr();
             initializeDeviceCandidateDev();
             initializeDeviceResultsDev();
             initializeDeviceNResultsDev();
-            // sync
-            std::cout << "Starting DMA" << std::endl;
-            _hb->sync_rw();
+            initializeGroupData();
         }
 
         void runKernel() {
+            _kernel_runner->beforeLaunchKernel(*this);
+            // sync
+            std::cout << "Starting DMA" << std::endl;
+            _hb->sync_rw();
             std::cout << "Launching kernel" << std::endl;
             _kernel_runner->runKernel(*this);
+            _kernel_runner->afterLaunchKernel(*this);
         }
 
         void readResults() {
@@ -145,36 +261,75 @@ namespace ipnsw {
         }
 
         hb_mc_eva_t db_dev() const { return _db_dev; }
-        hb_mc_eva_t query_dev() const { return _query_dev; }
-        hb_mc_eva_t seen_dev() const { return _seen_dev; }
-        hb_mc_eva_t v_curr_dev() const { return _v_curr_dev; }
-        hb_mc_eva_t d_curr_dev() const { return _d_curr_dev; }
+        hb_mc_eva_t query_dev(hb_mc_eva_t qidx) const {
+            return _query_dev + qidx * sizeof(_queries[qidx]);
+        }
+
+        hb_mc_eva_t seen_dev(hb_mc_eva_t grp) const {
+            return _seen_dev[grp];
+        }
+
+        hb_mc_eva_t curr_dev(hb_mc_eva_t grp = 0) const {
+            return _curr_dev + (grp*sizeof(GreedyWalkResult));
+        }
+
+        hb_mc_eva_t v_curr_dev(hb_mc_eva_t grp) const { // address sizeof(float) bytes into the group's curr_dev slot
+            return curr_dev(grp) + sizeof(float); // NOTE(review): assumes GreedyWalkResult stores a float first with the vertex id right after it — confirm
+        }
+        hb_mc_eva_t d_curr_dev(hb_mc_eva_t grp) const { // address of the start of the group's curr_dev slot
+            return curr_dev(grp); // NOTE(review): presumably the float distance is the first field of GreedyWalkResult — confirm
+        }
+
         hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; }
-        hb_mc_eva_t candidates_dev() const { return _candidates_dev; }
-        hb_mc_eva_t results_dev() const { return _results_dev; }
-        hb_mc_eva_t n_results_dev() const { return _n_results_dev; }
 
+        hb_mc_eva_t candidates_dev(hb_mc_eva_t grp) const {
+            return _candidates_dev[grp];
+        }
+
+        hb_mc_eva_t results_dev(hb_mc_eva_t grp) const {
+            return _results_dev[grp];
+        }
+
+        hb_mc_eva_t n_results_dev(hb_mc_eva_t grp) const {
+            return _n_results_dev + grp * sizeof(int);
+        }
+        
+        hb_mc_eva_t group_data_dev(hb_mc_eva_t grp) const {
+            return _group_data_dev + grp * sizeof(GroupData);
+        }
+
+        int numGroups() const { return _kernel_runner->gd(*this).x() * _kernel_runner->gd(*this).y(); }
+
+        const std::vector<std::array<float,100>> & db() const { return _db; }
+
+        const IPNSWRunnerConfig & cfg() const { return _cfg; }
         /////////////
         // Setters //
         /////////////
 
     private:
-        std::unique_ptr<IO>                  _io;
-        std::vector<Graph>                   _graphs;
-        std::vector<std::array<float, 100>>  _db;
-        std::vector<std::array<float, 100>>  _queries;
-        HammerBlade::Ptr                     _hb;
+        IPNSWRunnerConfig                     _cfg;
+
+    public:
+        std::unique_ptr<IO>                   _io;
+
+    private:
+        std::vector<Graph>                    _graphs;
+        std::vector<std::array<float, 100>>   _db;
+        std::vector<std::array<float, 100>>   _queries;
+        std::vector<GroupData>                _group_data;
+        HammerBlade::Ptr                      _hb;
 
         // device pointers
         hb_mc_eva_t _db_dev;
         hb_mc_eva_t _query_dev;
-        hb_mc_eva_t _seen_dev;
-        hb_mc_eva_t _v_curr_dev;
-        hb_mc_eva_t _d_curr_dev;
+        std::vector<hb_mc_eva_t> _seen_dev;
+        hb_mc_eva_t _curr_dev;
         hb_mc_eva_t _graph_metadata_dev;
-        hb_mc_eva_t _candidates_dev;
-        hb_mc_eva_t _results_dev;
+        std::vector<hb_mc_eva_t> _candidates_dev;
+        std::vector<hb_mc_eva_t> _results_dev;
         hb_mc_eva_t _n_results_dev;
+        hb_mc_eva_t _group_data_dev;
 
         // composites
         std::unique_ptr<IPNSWKernelRunner> _kernel_runner;
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
index 9a4861844..ff0468903 100644
--- a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp
@@ -9,9 +9,9 @@ namespace ipnsw {
             _iterations(iterations) {
         }
 
-    private:
-        IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkKernelRunner(_iterations); }
-        IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; }
+    protected:
+        virtual IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkKernelRunner(_iterations); }
+        virtual IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; }
 
         int _iterations;
     };
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
index e9d3010bc..1ee4da763 100644
--- a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp
@@ -15,17 +15,16 @@ namespace ipnsw {
             return "inner_product_ubmk";
         }
 
-        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
+        virtual std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const {
             std::vector<hb_mc_eva_t> argv = {
                 runner.db_dev(), // database
-                runner.query_dev(), // query
+                runner.query_dev(0), // query
                 static_cast<hb_mc_eva_t>(_iterations), // number of inner products
             };
             return argv;
-        };
-        Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);}
-        Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);}
+        }
 
+    protected:
         int _iterations;
     };
 }
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp
new file mode 100644
index 000000000..964cc2d8e
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp
@@ -0,0 +1,20 @@
+#pragma once
+#include "IPNSWFactory.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IProductUBmkResultReader.hpp"
+#include "IProductUBmkFactory.hpp"
+#include "IProductUBmkParallelKernelRunner.hpp"
+
+namespace ipnsw {
+    // IProductUBmkFactory variant whose kernel runner is IProductUBmkParallelKernelRunner.
+    class IProductUBmkParallelFactory : public IProductUBmkFactory {
+    public:
+        IProductUBmkParallelFactory(int iterations = 10):
+            IProductUBmkFactory(iterations) {
+        }
+
+    private:
+        IPNSWKernelRunner *_KernelRunner() const override { return new IProductUBmkParallelKernelRunner(_iterations); }
+    };
+}
+
diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp
new file mode 100644
index 000000000..668114fb2
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp
@@ -0,0 +1,65 @@
+#pragma once
+#include "IPNSWKernelRunner.hpp"
+#include "IProductUBmkKernelRunner.hpp"
+#include "IPNSWRunner.hpp"
+#include "HammerBlade.hpp"
+#include <algorithm>
+#include <random>
+
+namespace ipnsw {
+    // Inner-product microbenchmark runner using the configured grid/group and a shuffled visit list.
+    class IProductUBmkParallelKernelRunner : public IProductUBmkKernelRunner {
+    public:
+        IProductUBmkParallelKernelRunner(int iterations = 10) :
+            IProductUBmkKernelRunner(iterations) {
+        }
+
+    private:
+        using HammerBlade = hammerblade::host::HammerBlade;
+
+        // Builds the visit list (_iterations entries per group), shuffles it, and
+        // pushes a write of it to a freshly allocated device buffer.
+        void beforeLaunchKernel(const IPNSWRunner &runner) override {
+            HammerBlade::Ptr _hb = HammerBlade::Get();
+            _visit.clear();
+            for (int i = 0; i < _iterations * runner.numGroups(); ++i) {
+                _visit.push_back((i*3) % runner.db().size());
+            }
+            // std::random_shuffle was removed in C++17; a default-seeded engine
+            // keeps the order reproducible from run to run.
+            std::shuffle(_visit.begin(), _visit.end(), std::mt19937());
+            _visit_dev = _hb->alloc(sizeof(int) * _visit.size());
+            std::cout << "beforeLaunchKernel called: _visit_dev = " << std::hex << _visit_dev << std::endl;
+            std::cout << std::dec;
+            _hb->push_write(_visit_dev, &_visit[0], sizeof(int) * _visit.size());
+        }
+
+        // Kernel arguments: database, first query, iteration count, visit list.
+        std::vector<hb_mc_eva_t> argv(const IPNSWRunner & runner) const override {
+            std::cout << "Called" << std::endl;
+            std::vector<hb_mc_eva_t> argv = {
+                runner.db_dev(), // database
+                runner.query_dev(0), // query
+                static_cast<hb_mc_eva_t>(_iterations), // number of inner products
+                _visit_dev, // vectors to visit
+            };
+            return argv;
+        }
+
+        // Frees the device visit list allocated by beforeLaunchKernel().
+        void afterLaunchKernel(const IPNSWRunner &runner) override {
+            HammerBlade::Ptr _hb = HammerBlade::Get();
+            _hb->free(_visit_dev);
+            _visit.clear();
+        }
+
+        Dim gd(const IPNSWRunner &runner) const override {
+            return Dim(runner.cfg().grid_x(),runner.cfg().grid_y());
+        }
+        Dim tgd(const IPNSWRunner &runner) const override {
+            return Dim(runner.cfg().grp_x(),runner.cfg().grp_y());
+        }
+        hb_mc_eva_t          _visit_dev;
+        std::vector<int>     _visit;
+    };
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index 6e814f018..b4bfa09d7 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -100,30 +100,17 @@ KERNEL_DEFAULT      := kernel/$(DEFAULT_VERSION)/kernel.cpp
 # END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES
 ################################################################################
 
-
-################################################################################
-# Define the $(HOST_TARGET), the name of the host executable to generate. The
-# cosimulation host executable will be called
-# $(HOST_TARGET).cosim. HOST_*SOURCES list the host files that should be
-# compiled and linked into the executable.
-################################################################################
-
-HOST_TARGET         := ipnsw
-HOST_CSOURCES       := 
-HOST_CXXSOURCES     += GreedyWalkResults.cpp
-HOST_INCLUDES       := -I$(CURRENT_PATH)
-
 ################################################################################
 # Include the Cosimulation host build rules (This must be included after
 # HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc)
 ################################################################################
-
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2
-ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3
+HOST_TARGET = ipnsw
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2
+C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3
 
 
 ################################
@@ -140,7 +127,9 @@ kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp
 	cp $$< $$@
 
 # adds arguments
-kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: ARGS += --num-iproducts $(1)
+kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1)
+kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv
+kernel/iproduct_ubmk-$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
 
 # adds to list of iproduct u-bmk
 IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1)
@@ -162,7 +151,7 @@ purge-iproduct-ubmk:
 
 # collect stats for all
 iproduct-ubmk-stats: create-iproduct-ubmk
-iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/stats)
+iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
 
 # Add to versions
 VERSIONS += $(IPRODUCT-UBMK-VERSIONS)
@@ -179,7 +168,9 @@ kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.c
 	cp $$< $$@
 
 # adds arguments
-kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1)
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1)
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/greedy_walk-query$(1)/kernel.riscv
+kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
 
 # adds to list of greedy walk versions
 GREEDY-WALK-VERSIONS += greedy_walk-query$(1)
@@ -201,7 +192,7 @@ purge-greedy-walk:
 
 # collect stats for all
 greedy-walk-stats: create-greedy-walk
-greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/stats)
+greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
 
 # Add to versions
 VERSIONS += $(GREEDY-WALK-VERSIONS)
@@ -219,7 +210,9 @@ kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.c
 	cp $$< $$@
 
 # adds arguments
-kernel/beam_search-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1)
+kernel/beam_search-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1)
+kernel/beam_search-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/beam_search-query$(1)/kernel.riscv
+kernel/beam_search-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
 
 # adds to list of greedy walk versions
 BEAM-SEARCH-VERSIONS += beam_search-query$(1)
@@ -242,7 +235,7 @@ purge-beam-search:
 
 # collect stats for all
 beam-search-stats: create-beam-search
-beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/stats)
+beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
 
 # Add to versions
 VERSIONS += $(BEAM-SEARCH-VERSIONS)
@@ -251,31 +244,32 @@ VERSIONS += $(BEAM-SEARCH-VERSIONS)
 # Continue including cosim build rules #
 ########################################
 
--include $(FRAGMENTS_PATH)/host/cosim.mk
-
-GRAPH-TOOLS := $(CURRENT_PATH)/graph-tools
+GRAPH-TOOLS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools
 graphtools-dir := $(GRAPH-TOOLS)
 
 include $(GRAPH-TOOLS)/libgraphtools.mk
 
-HB-HELPERS := $(CURRENT_PATH)/hammerblade-helpers
+HB-HELPERS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers
+hammerblade-helpers-dir := $(HB-HELPERS)
 include $(HB-HELPERS)/libhammerblade-helpers-host.mk
 
 CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags)
 CXXFLAGS += $(libgraphtools-interface-cxxflags)
+CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+CXXFLAGS += -DCOSIM
 
 LDFLAGS  += $(libhammerblade-helpers-host-interface-ldflags)
 LDFLAGS  += $(libgraphtools-interface-ldflags)
-VSOURCES += GreedyWalkResults.cpp
-
-$(HOST_TARGET): $(libhammerblade-helpers-host-interface-headers)
-$(HOST_TARGET): $(libgraphtools-interface-headers)
-$(HOST_TARGET): $(libgraphtools-interface-libraries)
-$(HOST_TARGET): GreedyWalkResults.o
 
+GreedyWalkResults.o: $(libhammerblade-helpers-host-interface-headers)
+GreedyWalkResults.o: $(libgraphtools-interface-headers)
+GreedyWalkResults.o: $(libgraphtools-interface-libraries)
 GreedyWalkResults.o: GreedyWalkResults.cpp
 GreedyWalkResults.o: GreedyWalkResults.hpp
 
+ipnsw.o: $(libhammerblade-helpers-host-interface-headers)
+ipnsw.o: $(libgraphtools-interface-headers)
+ipnsw.o: $(libgraphtools-interface-libraries)
 ipnsw.o: IO.hpp
 ipnsw.o: IPNSWGraph.hpp
 ipnsw.o: IPNSWRunner.hpp
@@ -292,6 +286,13 @@ ipnsw.o: GreedyWalkFactory.hpp
 ipnsw.o: BeamSearchFactory.hpp
 ipnsw.o: IProductUBmkFactory.hpp
 ipnsw.o: StringHelpers.hpp
+
+TEST_SOURCES = ipnsw.cpp GreedyWalkResults.cpp
+
+-include $(EXAMPLES_PATH)/compilation.mk
+-include $(EXAMPLES_PATH)/link.mk
+-include $(EXAMPLES_PATH)/execution.mk
+
 ################################################################################
 # Define the clean rules. clean calls the makefile-specific cleans, whereas
 # users can add commands and dependencies to custom.clean.
@@ -302,50 +303,3 @@ version.clean:
 
 custom.clean: version.clean
 
-clean: cosim.clean analysis.clean cudalite.clean custom.clean
-
-################################################################################
-# Define overall-goals. The all rule runs all kernel versions, and the default
-# kernel.
-################################################################################
-
-_HELP_STRING := "Makefile Rules\n"
-
-_HELP_STRING += "    default: \n"
-_HELP_STRING += "        - Run the default kernel ($KERNEL_DEFAULT) and generate all of the\n"
-_HELP_STRING += "          analysis products\n"
-default: pc_stats graphs stats
-
-_HELP_STRING += "    analysis: \n"
-_HELP_STRING += "        - Launch indpendent cosimulation executions of each kernel version.\n"
-_HELP_STRING += "          When execution finishes, it generates all the analysis products \n"
-_HELP_STRING += "          for each kernel in each respective kernel/<version_name>/ \n"
-_HELP_STRING += "          directory\n"
-analysis: $(foreach v,$(VERSIONS),kernel/$v/pc_stats kernel/$v/graphs kernel/$v/stats)
-
-_HELP_STRING += "    statistics: \n"
-_HELP_STRING += "        - Launch indpendent cosimulation executions of each kernel version.\n"
-_HELP_STRING += "          When execution finishes, it generates ONLY the parsed operation \n"
-_HELP_STRING += "          stats for each kernel in each respective kernel/<version_name>/ \n"
-_HELP_STRING += "          directory\n"
-statistics: $(foreach v,$(VERSIONS),kernel/$v/stats)
-
-_HELP_STRING += "    all: \n"
-_HELP_STRING += "        - Launch both the default and analysis target\n"
-all: analysis default
-
-.DEFAULT_GOAL = help
-_HELP_STRING += "    help: \n"
-_HELP_STRING += "        - Output a friendly help message.\n"
-help:
-	@echo -e $(HELP_STRING)
-
-# Always re-run, if asked.
-.PHONY: default analysis help
-
-# These last three lines ensure that _HELP_STRING is appended to the top of
-# whatever else comes before it.
-_HELP_STRING += "\n"
-_HELP_STRING += $(HELP_STRING)
-HELP_STRING := $(_HELP_STRING)
-
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
index ea920b295..8de8e073e 100644
--- a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
@@ -1,3 +1,4 @@
+#include "bsg_manycore_regression.h"
 #include "ipnsw.hpp"
 #include "HammerBlade.hpp"
 #include "Graph500Data.hpp"
@@ -8,6 +9,7 @@
 #include "IProductUBmkKernelRunner.hpp"
 #include "IProductUBmkResultReader.hpp"
 #include "IProductUBmkFactory.hpp"
+#include "IProductUBmkParallelFactory.hpp"
 #include "BeamSearchKernelRunner.hpp"
 #include "BeamSearchResultReader.hpp"
 #include "BeamSearchFactory.hpp"
@@ -19,8 +21,6 @@
 #include <iostream>
 #include <memory>
 
-#include "GreedyWalkResults.cpp"
-
 using namespace ipnsw;
 
 int Main(int argc, char *argv[])
@@ -31,6 +31,12 @@ int Main(int argc, char *argv[])
     std::unique_ptr<IPNSWRunner> runner;
     std::unique_ptr<IPNSWFactory> factory;
 
+    IPNSWRunnerConfig cfg;
+    cfg.grid_x() = args.grid_x();
+    cfg.grid_y() = args.grid_y();
+    cfg.grp_x()  = args.grp_x();
+    cfg.grp_y()  = args.grp_y();
+
     if (ipnsw::startswith(args.version(), "greedy_walk")) {
         factory = std::unique_ptr<IPNSWFactory>(new GreedyWalkFactory);
     } else if (ipnsw::startswith(args.version(), "beam_search")) {
@@ -39,11 +45,20 @@ int Main(int argc, char *argv[])
         /* parse the number of inner products */
         std::cout << "num inner products " << args.num_iproducts() << std::endl;
         int n_iproducts = args.num_iproducts();
-        factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkFactory(n_iproducts));
+
+        bool parallel = args.version().find("parallel") != std::string::npos;
+        if (parallel) {
+            factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkParallelFactory(n_iproducts));
+        } else {
+            factory = std::unique_ptr<IPNSWFactory>(new IProductUBmkFactory(n_iproducts)) ;
+        }
+
     } else if (args._version == "debug") {
         /* just for debugging */
         std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl;
-        std::cout << "--queries=";
+        std::cout << "--queries=" << std::endl;
+        std::cout << "--group-x=" << args.grp_x() << std::endl;
+        std::cout << "--group-y=" << args.grp_y() << std::endl;
         auto do_queries = args.do_queries();
         for (auto q : do_queries) {
             std::cout << q << " ";
@@ -54,33 +69,10 @@ int Main(int argc, char *argv[])
         return 0;
     }
 
-    runner = std::unique_ptr<IPNSWRunner>(new IPNSWRunner(args, factory));
+    runner = std::unique_ptr<IPNSWRunner>(new IPNSWRunner(args, factory, cfg));
     runner->run();
 
     return 0;
 }
 
-#ifdef COSIM
-void cosim_main(uint32_t *exit_code, char * args) {
-    // We aren't passed command line arguments directly so we parse them
-    // from *args. args is a string from VCS - to pass a string of arguments
-    // to args, pass c_args to VCS as follows: +c_args="<space separated
-    // list of args>"
-    int argc = get_argc(args);
-    char *argv[argc];
-    get_argv(args, argc, argv);
-
-#ifdef VCS
-    svScope scope;
-    scope = svGetScopeFromName("tb");
-    svSetScope(scope);
-#endif
-    int rc = Main(argc, argv);
-    *exit_code = rc;
-    return;
-}
-#else
-int main(int argc, char ** argv) {
-    return Main(argc, argv);
-}
-#endif
+declare_program_main("IPNSW", Main);
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
index 385873c50..9c91c72bd 100644
--- a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp
@@ -34,4 +34,3 @@
 #include <typeinfo>
 #include <bsg_manycore_errno.h>
 #include <bsg_manycore_cuda.h>
-#include "../common.h"
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
index a69965073..9ee2ce5e7 100644
--- a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp
@@ -175,7 +175,7 @@ extern "C" {
         }
 
         int n_res = std::min(results.size(), N_RESULTS);
-        std::sort(results_mem, results_mem+n_res, LT());
+        std::sort(results_mem, results_mem+results.size(), LT());
         bsg_cuda_print_stat_end(0);
 
         *n_results = n_res;
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp
new file mode 100644
index 000000000..d55e7e900
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp
@@ -0,0 +1,279 @@
+/*
+ * IPNSW beam search kernel: tile 0 runs the search and hands distance computations to the other tiles
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 8
+#define BSG_TILE_GROUP_Y_DIM 4
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100 // floats per database/query vector
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128 // the result heap is pruned back to this size during the search
+#define N_RESULTS 10  // upper bound on the count written to *n_results
+
+#define G_0 3 // index into Gs of the graph that ipnsw_beam_search traverses
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v] is the index in `neighbors` of v's first neighbor
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // number of vertices
+    int E;                // number of entries in `neighbors`
+};
+
+class LT { // "less than" on the float key of a (float, int) pair
+public:
+    bool operator()(const std::pair<float, int> &a, const std::pair<float, int> &b) {
+        return a.first < b.first; // the int member takes no part in the ordering
+    }
+};
+
+class GT { // "greater than" on the float key of a (float, int) pair
+public:
+    bool operator()(const std::pair<float, int> &a, const std::pair<float, int> &b) {
+        return a.first > b.first; // the int member takes no part in the ordering
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    { // Debug helper: copies the four graph descriptors out of Gs one by one; always returns 0
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        // without DEBUG_INPUT_TEST nothing below is printed
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the descriptors in Gs are read
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // local copy of descriptor i
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+// distance is the negated inner product, so a larger inner product gives a smaller distance
+
+    static constexpr int SYNC_INV  = -1; // initial value of dst_slave; also passed to sleep_until_valid
+    static constexpr int SYNC_DONE = -2; // value that makes ipnsw_distance_slave leave its loop
+
+    void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database,
+                              const float *query,
+                              int   *dst_p,      // polled for the next vertex id to process
+                              float *distance_p, // result array; this tile uses slot __bsg_id
+                              int   *done_p,     // completion flags; this tile uses slot __bsg_id
+                              DenseSet_v1<int> *seen)
+    {
+        float *result = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_id]); // presumably tile (0,0)'s copy -- TODO confirm
+        int   *done   = bsg_tile_group_remote_pointer<int>(  0, 0, &done_p[__bsg_id]);
+        while (true) {
+            int dst = sleep_until_valid(dst_p, SYNC_INV); // presumably blocks while *dst_p == SYNC_INV -- TODO confirm
+            if (dst == SYNC_DONE)
+                break;
+            // only vertices not yet in the seen-set get a distance computed
+            if (!seen->in(dst)) {
+                seen->atomic_insert(dst);
+                //bsg_print_int(dst);
+                float tmp = distance(query, &database[dst * VSIZE]);
+                //bsg_print_float(tmp);
+                *result = tmp;
+            } else {
+                *result = -INFINITY; // sentinel that the reduce loop in ipnsw_beam_search skips
+            }
+            *done = 1; // written after the result
+        }
+    }
+
+    int ipnsw_beam_search(const graph *Gs,                                  // graph descriptors; only Gs[G_0] is searched
+                          bsg_attr_remote const float *__restrict database, // VSIZE floats per vertex
+                          const float *query,                               // VSIZE floats
+                          int *seen_mem,                                    // storage behind the seen-set
+                          int *v_curr_o, float *d_curr_o,                   // start vertex and its distance
+                          std::pair<float, int> *candidates_mem,            // storage behind the candidate heap
+                          std::pair<float, int> *results_mem,               // storage behind the result heap; sorted before return
+                          int *n_results)                                   // out (tile 0 only): min(result count, N_RESULTS)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int   dst_slave = SYNC_INV; // per-tile slot through which tile 0 hands out vertex ids
+        float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+        int   dist_done  [BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+
+        if (__bsg_id != 0) {
+            ipnsw_distance_slave(database, q, &dst_slave, dist_result, dist_done, &seen);
+        } else {
+            bsg_saif_start();
+            int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+            for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x)
+                for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) {
+                    dst_slave_ptr[bsg_x_y_to_id(x,y)]
+                        = bsg_tile_group_remote_pointer(x, y, &dst_slave);
+                    dist_result[bsg_x_y_to_id(x,y)] = INFINITY;
+                    dist_done[bsg_x_y_to_id(x,y)]   = 0;
+                }
+
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif
+
+            // initialize priority queues
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // neighbor range of v_best; the last vertex has no offsets[v_best+1], so its range ends at G.E
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                // traverse neighbors, one tile-group-sized batch at a time
+                for (int dst_i = 0;
+                     dst_i < degree;
+                     dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) {
+                     // read-in work
+                    int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i);
+                    int dst_v[dst_n];
+                    memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                    // delegate work: batch entry j goes to the tile with id j
+                    int dst;
+                    for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+                        *dst_slave_ptr[dst_j] = dst;
+                    }
+                    // work myself: tile 0 takes batch entry 0
+                    {
+                        dst = dst_v[0];
+                        if (!seen.in(dst)) {
+                            seen.atomic_insert(dst);
+                            dist_result[0] = distance(q, &database[dst * VSIZE]);
+                        } else {
+                            dist_result[0] = -INFINITY;
+                        }
+                        dist_done[0] = 1;
+                    }
+                    // reduce: consume the batch results in order
+                    for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_int(dst);
+#endif
+                        bsg_wait_local_int_asm_blind(&dist_done[dst_j], 1);
+                        dist_done[dst_j] = 0;
+                        float d_neib = dist_result[dst_j];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_float(d_neib);
+#endif
+                        // already seen? (-INFINITY is the sentinel written for seen vertices)
+                        if (d_neib == -INFINITY)
+                            continue;
+
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+
+                }
+
+            }
+            // NOTE(review): nothing in this function writes SYNC_DONE to the helper tiles -- confirm how they leave ipnsw_distance_slave
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+            bsg_saif_end();
+
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
new file mode 100644
index 000000000..1ced33c51
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * IPNSW beam search kernel for a 1x1 tile group; distances use inner_product_v4_serial
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100 // floats per database/query vector
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128 // the result heap is pruned back to this size during the search
+#define N_RESULTS 10  // upper bound on the count written to *n_results
+
+#define G_0 3 // index into Gs of the graph that ipnsw_beam_search traverses
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v] is the index in `neighbors` of v's first neighbor
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // number of vertices
+    int E;                // number of entries in `neighbors`
+};
+
+class LT { // "less than" on the float key of a (float, int) pair
+public:
+    bool operator()(const std::pair<float, int> &a, const std::pair<float, int> &b) {
+        return a.first < b.first; // the int member takes no part in the ordering
+    }
+};
+
+class GT { // "greater than" on the float key of a (float, int) pair
+public:
+    bool operator()(const std::pair<float, int> &a, const std::pair<float, int> &b) {
+        return a.first > b.first; // the int member takes no part in the ordering
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    { // Debug helper: copies the four graph descriptors out of Gs one by one; always returns 0
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        // without DEBUG_INPUT_TEST nothing below is printed
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3}; // order in which the descriptors in Gs are read
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // local copy of descriptor i
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+// distance is the negated inner product, so a larger inner product gives a smaller distance
+
+    int ipnsw_beam_search(const graph *Gs, // graph descriptors; only Gs[G_0] is searched
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,        // start vertex and its distance
+                          std::pair<float, int> *candidates_mem, // storage behind the candidate heap
+                          std::pair<float, int> *results_mem,    // storage behind the result heap; sorted before return
+                          int *n_results)                        // out: min(result count, N_RESULTS)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // neighbor range of v_best; the last vertex has no offsets[v_best+1], so its range ends at G.E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = distance(q, &database[dst*VSIZE]);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+
+        }
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+        bsg_cuda_print_stat_end(0);
+
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
index 18a29fd33..7073bb548 100644
--- a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp
@@ -92,7 +92,8 @@ extern "C" {
     }
 
 // Uncomment to turn on debugging
-#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
 
 #define distance(v0, v1)                                                \
     (-1 * inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(v0, v1))
@@ -117,8 +118,10 @@ extern "C" {
         // retrieve results from greedy walk
         int v_curr   = *v_curr_o;
         float d_curr = *d_curr_o;
-        //bsg_print_int(v_curr);
-        //bsg_print_float(d_curr);
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif
 
         // initialize priority queues
         DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
@@ -176,7 +179,7 @@ extern "C" {
         }
 
         int n_res = std::min(results.size(), N_RESULTS);
-        std::sort(results_mem, results_mem+n_res, LT());
+        std::sort(results_mem, results_mem+results.size(), LT());
         bsg_cuda_print_stat_end(0);
 
         *n_results = n_res;
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
new file mode 100644
index 000000000..e88095dea
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp
@@ -0,0 +1,195 @@
+/*
+ * IPNSW beam-search kernel (beam_search_v6); distances come from InnerProductParallel_v1.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+struct graph {
+    const int *offsets;   // offsets[v] = index in `neighbors` where v's list starts
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // vertex count; the search treats V-1 as the last vertex id
+    int E;                // length of `neighbors`; used as the end of the last vertex's list
+};
+
+class LT { // "less than": true when lhs has the strictly smaller distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // "greater than": true when lhs has the strictly larger distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Debug helper: copies the four graph descriptors out of Gs in G_0..G_3
+    // order and, when DEBUG_INPUT_TEST is defined, prints the argument pointers
+    // and each descriptor's fields. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph local;
+        for (int k = 0; k < 4; ++k) {
+            memcpy(&local, &Gs[order[k]], sizeof(local));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", k, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", k, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", k, local.V);
+            bsg_printf("G[%d].E = %d\n", k, local.E);
+#endif // DEBUG_INPUT_TEST
+        }
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+    // Beam search on graph Gs[G_0], seeded with the greedy-walk result
+    // (*v_curr_o, *d_curr_o); a distance is -1 * inner product with the query.
+    // Only tile 0 searches. It leaves the kept (distance, vertex) pairs sorted
+    // by ascending distance in results_mem and stores
+    // min(results.size(), N_RESULTS) in *n_results. Always returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        if (__bsg_id == 0) {
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+
+            // initialize priority queues
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at E
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(dst);
+#endif
+                    if (!seen.in(dst)) {
+                        // mark as seen
+                        seen.insert(dst);
+                        float d_neib = -1 * ip.inner_product(dst);
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+                }
+            }
+
+            //ip.exit();
+
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
new file mode 100644
index 000000000..37d995573
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp
@@ -0,0 +1,194 @@
+/*
+ * IPNSW beam-search kernel (beam_search_v7); distances come from InnerProductParallel_v1.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+struct graph {
+    const int *offsets;   // offsets[v] = index in `neighbors` where v's list starts
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // vertex count; the search treats V-1 as the last vertex id
+    int E;                // length of `neighbors`; used as the end of the last vertex's list
+};
+
+class LT { // "less than": true when lhs has the strictly smaller distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // "greater than": true when lhs has the strictly larger distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Debug helper: copies the four graph descriptors out of Gs in G_0..G_3
+    // order and, when DEBUG_INPUT_TEST is defined, prints the argument pointers
+    // and each descriptor's fields. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph local;
+        for (int k = 0; k < 4; ++k) {
+            memcpy(&local, &Gs[order[k]], sizeof(local));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", k, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", k, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", k, local.V);
+            bsg_printf("G[%d].E = %d\n", k, local.E);
+#endif // DEBUG_INPUT_TEST
+        }
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+
+    // Beam search on graph Gs[G_0], seeded with the greedy-walk result
+    // (*v_curr_o, *d_curr_o); a distance is -1 * inner product with the query.
+    // Leaves the kept (distance, vertex) pairs sorted by ascending distance in
+    // results_mem and min(results.size(), N_RESULTS) in *n_results. Returns 0.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+
+        // Prepare other tiles for parallel inner products
+        InnerProduct ip(database, q);
+
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        ip.init();
+
+        // retrieve results from greedy walk
+        int v_curr   = *v_curr_o;
+        float d_curr = *d_curr_o;
+
+        // initialize priority queues
+        DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+        DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+        candidates.push({d_curr, v_curr});
+        results.push({d_curr, v_curr});
+
+        float d_worst = d_curr;
+        seen.insert(v_curr);
+
+        while (!candidates.empty()) {
+            int   v_best;
+            float d_best;
+            auto best = candidates.pop();
+            v_best = std::get<1>(best);
+            d_best = std::get<0>(best);
+
+            d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+            bsg_print_int(-v_best);
+#endif
+
+            if (d_best > d_worst) {
+                break;
+            }
+
+            // neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at E
+            int dst_0 = G.offsets[v_best];
+            int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+            for (int dst_i = 0; dst_i < degree; dst_i++) {
+                int dst = G.neighbors[dst_0+dst_i];
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(dst);
+#endif
+                if (!seen.in(dst)) {
+                    // mark as seen
+                    seen.insert(dst);
+                    float d_neib = -1 * ip.inner_product(dst);
+                    d_worst = std::get<0>(results.top());
+                    // if there's room for new result or this distance is promising
+                    if ((results.size() < EF) || (d_neib < d_worst)) {
+                        // push onto candidates and results
+                        candidates.push({d_neib, dst});
+                        results.push({d_neib, dst});
+
+                        // prune down to recall
+                        if (results.size() > EF)
+                            results.pop();
+                    }
+                }
+            }
+        }
+
+        //ip.exit();
+
+        int n_res = std::min(results.size(), N_RESULTS);
+        std::sort(results_mem, results_mem+results.size(), LT());
+
+        bsg_cuda_print_stat_end(0);
+        *n_results = n_res;
+
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
new file mode 100644
index 000000000..d87eaf3bd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp
@@ -0,0 +1,270 @@
+/*
+ * IPNSW beam-search kernel (beam_search_v8): tile 0 searches, other tiles compute neighbor distances.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v] = index in `neighbors` where v's list starts
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // vertex count; the search treats V-1 as the last vertex id
+    int E;                // length of `neighbors`; used as the end of the last vertex's list
+};
+
+class LT { // "less than": true when lhs has the strictly smaller distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // "greater than": true when lhs has the strictly larger distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    // Debug helper: copies the four graph descriptors out of Gs in G_0..G_3
+    // order and, when DEBUG_INPUT_TEST is defined, prints the argument pointers
+    // and each descriptor's fields. Always returns 0.
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {
+#ifdef DEBUG_INPUT_TEST
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // DEBUG_INPUT_TEST
+        const int order[] = {G_0, G_1, G_2, G_3};
+        struct graph local;
+        for (int k = 0; k < 4; ++k) {
+            memcpy(&local, &Gs[order[k]], sizeof(local));
+#ifdef DEBUG_INPUT_TEST
+            bsg_printf("G[%d].offsets   = %08x\n", k, local.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", k, local.neighbors);
+            bsg_printf("G[%d].V = %d\n", k, local.V);
+            bsg_printf("G[%d].E = %d\n", k, local.E);
+#endif // DEBUG_INPUT_TEST
+        }
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+
+    static constexpr int SYNC_INV  = -1;
+    static constexpr int SYNC_DONE = -2;
+
+    // Worker loop run by every tile except tile 0: takes vertex ids from *dst_p
+    // and reports each one's distance to `query` (-INFINITY if already seen).
+    void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database,
+                              const float *query,
+                              int   *dst_p,
+                              float *distance_p,
+                              DenseSet_v1<int> *seen)
+    {
+        // this tile's entry of distance_p, addressed through a remote pointer at tile (0, 0)
+        float *out = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_id]);
+        for (;;) {
+            // presumably blocks until *dst_p holds something other than SYNC_INV
+            const int v = sleep_until_valid(dst_p, SYNC_INV);
+            if (v == SYNC_DONE)
+                return;
+            if (seen->in(v)) {
+                *out = -INFINITY; // the master skips results equal to -INFINITY
+                continue;
+            }
+            seen->atomic_insert(v);
+            *out = distance(query, &database[v * VSIZE]);
+        }
+    }
+
+    // Beam search on graph Gs[G_0], seeded with the greedy-walk result
+    // (*v_curr_o, *d_curr_o). Tile 0 runs the search and hands neighbor distance
+    // computations to the other tiles, which sit in ipnsw_distance_slave() until
+    // tile 0 sends SYNC_DONE. Sorted results go to results_mem, count to *n_results.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        int   dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+
+        if (__bsg_id != 0) {
+            ipnsw_distance_slave(database, q, &dst_slave, dist_result, &seen);
+        } else {
+            int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM];
+            for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x)
+                for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) {
+                    dst_slave_ptr[bsg_x_y_to_id(x,y)]
+                        = bsg_tile_group_remote_pointer(x, y, &dst_slave);
+                    dist_result[bsg_x_y_to_id(x,y)] = INFINITY;
+                }
+
+            // retrieve results from greedy walk
+            int v_curr   = *v_curr_o;
+            float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif
+
+            // initialize priority queues
+            DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+            DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+            candidates.push({d_curr, v_curr});
+            results.push({d_curr, v_curr});
+
+            float d_worst = d_curr;
+            seen.insert(v_curr);
+
+            while (!candidates.empty()) {
+                int   v_best;
+                float d_best;
+                auto best = candidates.pop();
+                v_best = std::get<1>(best);
+                d_best = std::get<0>(best);
+
+                d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                bsg_print_int(-v_best);
+#endif
+
+                if (d_best > d_worst) {
+                    break;
+                }
+
+                // neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at E
+                int dst_0 = G.offsets[v_best];
+                int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+                // traverse neighbors, one batch of up to X_DIM*Y_DIM vertices at a time
+                for (int dst_i = 0;
+                     dst_i < degree;
+                     dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) {
+                     // read-in work
+                    int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i);
+                    int dst_v[dst_n];
+                    memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                    // delegate work
+                    int dst;
+                    for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+                        *dst_slave_ptr[dst_j] = dst;
+                    }
+                    // work myself
+                    {
+                        dst = dst_v[0];
+                        if (!seen.in(dst)) {
+                            seen.atomic_insert(dst);
+                            dist_result[0] = distance(q, &database[dst * VSIZE]);
+                        } else {
+                            dist_result[0] = -INFINITY;
+                        }
+                    }
+                    // reduce
+                    for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                        dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_int(dst);
+#endif
+                        float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                        bsg_print_float(d_neib);
+#endif
+                        // already seen?
+                        if (d_neib == -INFINITY)
+                            continue;
+
+                        d_worst = std::get<0>(results.top());
+                        // if there's room for new result or this distance is promising
+                        if ((results.size() < EF) || (d_neib < d_worst)) {
+                            // push onto candidates and results
+                            candidates.push({d_neib, dst});
+                            results.push({d_neib, dst});
+                            // prune down to recall
+                            if (results.size() > EF)
+                                results.pop();
+                        }
+                    }
+                }
+            }
+
+            // release the worker tiles: they loop until they receive SYNC_DONE
+            for (int id = 1; id < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++id)
+                *dst_slave_ptr[id] = SYNC_DONE;
+
+            int n_res = std::min(results.size(), N_RESULTS);
+            std::sort(results_mem, results_mem+results.size(), LT());
+            *n_results = n_res;
+        }
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
new file mode 100644
index 000000000..69def7bdd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp
@@ -0,0 +1,249 @@
+/*
+ * IPNSW beam-search kernel (beam_search_v9): tile (0,0) searches, row-0 tiles evaluate neighbors via InnerProductParallel_Y.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 4
+#define BSG_TILE_GROUP_Y_DIM 4
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.hpp>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+#include "set.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+#define N_V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {
+    const int *offsets;   // offsets[v] = index in `neighbors` where v's list starts
+    const int *neighbors; // neighbor lists of all vertices, stored back to back
+    int V;                // vertex count; the search treats V-1 as the last vertex id
+    int E;                // length of `neighbors`; used as the end of the last vertex's list
+};
+
+class LT { // "less than": true when lhs has the strictly smaller distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return  std::get<0>(lhs) < std::get<0>(rhs);
+    }
+};
+
+class GT { // "greater than": true when lhs has the strictly larger distance (pair.first)
+public:
+    bool operator()(const std::pair<float, int> &lhs, const std::pair<float, int> &rhs) const {
+        return std::get<0>(lhs) > std::get<0>(rhs);
+    }
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Uncomment to turn on debugging
+//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+//#define DEBUG_BEAM_SEARCH_INPUT
+
+    using InnerProduct = InnerProductParallel_Y<BSG_TILE_GROUP_Y_DIM>;
+
+    static constexpr int SYNC_INV  = -1;
+    static constexpr int SYNC_DONE = -2;
+
+    // Worker loop for row-0 tiles with x != 0: takes vertex ids from *dst_p and
+    // reports -1 * inner product (or -INFINITY if already seen) for each one.
+    void ipnsw_x_master(bsg_attr_remote const float *__restrict database,
+                        const float *query,
+                        int   *dst_p,
+                        float *distance_p,
+                        DenseSet_v1<int> *seen,
+                        InnerProduct *ip_y)
+    {
+        float *out = bsg_tile_group_remote_pointer<float>(0, 0, &distance_p[__bsg_x]);
+        for (;;) {
+            const int v = sleep_until_valid(dst_p, SYNC_INV);
+            if (v == SYNC_DONE)
+                return;
+            if (seen->in(v)) {
+                *out = -INFINITY; // tile (0,0) skips results equal to -INFINITY
+                continue;
+            }
+            seen->atomic_insert(v);
+            *out = -1.0 * ip_y->inner_product(v);
+        }
+    }
+
+    // Beam search on graph Gs[G_0], seeded with the greedy-walk result
+    // (*v_curr_o, *d_curr_o). Tile (0,0) runs the search and hands one neighbor
+    // per batch to each other row-0 tile (ipnsw_x_master); tiles outside row 0 go
+    // straight to ip_y.exit(). Sorted results go to results_mem, count to *n_results.
+    int ipnsw_beam_search(const graph *Gs,
+                          bsg_attr_remote const float *__restrict database,
+                          const float *query,
+                          int *seen_mem,
+                          int *v_curr_o, float *d_curr_o,
+                          std::pair<float, int> *candidates_mem,
+                          std::pair<float, int> *results_mem,
+                          int *n_results)
+    {
+        // keep track of vertices seen
+        DenseSet_v1<int>seen(seen_mem);
+
+        // fetch graph and q out of memory
+        struct graph G = Gs[G_0];
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+
+        InnerProduct ip_y(database, q);
+        ip_y.init();
+
+        int   dst_slave = SYNC_INV;
+        float dist_result[BSG_TILE_GROUP_X_DIM];
+
+        if (__bsg_y == 0) {
+            if (__bsg_x == 0) {
+                int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM];
+                for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) {
+                    dst_slave_ptr[x] = bsg_tile_group_remote_pointer(x, 0, &dst_slave);
+                    dist_result[x] = INFINITY;
+                }
+
+                // retrieve results from greedy walk
+                int v_curr   = *v_curr_o;
+                float d_curr = *d_curr_o;
+#ifdef DEBUG_BEAM_SEARCH_INPUT
+                bsg_print_int(v_curr);
+                bsg_print_float(d_curr);
+#endif
+
+                // initialize priority queues
+                DynHeap<std::pair<float, int>, GT> candidates(candidates_mem, 512);
+                DynHeap<std::pair<float, int>, LT> results(results_mem, 128);
+
+                candidates.push({d_curr, v_curr});
+                results.push({d_curr, v_curr});
+
+                float d_worst = d_curr;
+                seen.insert(v_curr);
+
+                while (!candidates.empty()) {
+                    int   v_best;
+                    float d_best;
+
+                    auto best = candidates.pop();
+                    v_best = std::get<1>(best);
+                    d_best = std::get<0>(best);
+
+                    d_worst = std::get<0>(results.top());
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                    bsg_print_int(-v_best);
+#endif
+
+                    if (d_best > d_worst) {
+                        break;
+                    }
+
+                    // neighbors of v_best; the last vertex has no offsets[v+1], so its list ends at E
+                    int dst_0 = G.offsets[v_best];
+                    int degree = v_best == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0;
+
+                    // traverse neighbors, one batch of up to X_DIM vertices at a time
+                    for (int dst_i = 0;
+                         dst_i < degree;
+                         dst_i += BSG_TILE_GROUP_X_DIM) {
+                        // read-in work
+                        int dst_n = std::min(BSG_TILE_GROUP_X_DIM, degree-dst_i);
+                        int dst_v[dst_n];
+                        memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v));
+
+                        // delegate work
+                        int dst;
+                        for (int dst_j = 1; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+                            *dst_slave_ptr[dst_j] = dst;
+                        }
+                        // work myself
+                        {
+                            dst = dst_v[0];
+                            if (!seen.in(dst)) {
+                                seen.atomic_insert(dst);
+                                dist_result[0] = -1.0 * ip_y.inner_product(dst);
+                            } else {
+                                dist_result[0] = -INFINITY;
+                            }
+                        }
+                        // reduce
+                        for (int dst_j = 0; dst_j < dst_n; ++dst_j) {
+                            dst = dst_v[dst_j];
+
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_int(dst);
+#endif
+                            float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY);
+#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE
+                            bsg_print_float(d_neib);
+#endif
+                            // already seen?
+                            if (d_neib == -INFINITY)
+                                continue;
+
+                            d_worst = std::get<0>(results.top());
+                            // if there's room for new result or this distance is promising
+                            if ((results.size() < EF) || (d_neib < d_worst)) {
+                                // push onto candidates and results
+                                candidates.push({d_neib, dst});
+                                results.push({d_neib, dst});
+                                // prune down to recall
+                                if (results.size() > EF)
+                                    results.pop();
+                            }
+                        }
+                    }
+
+                }
+
+                // signal all columns done
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM; ++tile)
+                    *dst_slave_ptr[tile] = SYNC_DONE;
+
+                int n_res = std::min(results.size(), N_RESULTS);
+                std::sort(results_mem, results_mem+results.size(), LT());
+                *n_results = n_res;
+
+            } else { // bsg_x != 0
+                ipnsw_x_master(database, q, &dst_slave, dist_result, &seen, &ip_y);
+            }
+        }
+
+        ip_y.exit();
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp
new file mode 100644
index 000000000..aafefe6fd
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp
@@ -0,0 +1,147 @@
+/*
+ * IPNSW greedy-walk kernel: single tile, distances from inner_product_v4_serial()
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // adjacency lists of one graph, as read by ipnsw_greedy_search() below
+    const int *offsets;   // offsets[v]: index in neighbors[] where the list of vertex v starts
+    const int *neighbors; // neighbor vertex ids, all lists stored back to back
+    int V;                // vertex count; vertex V-1 has no offsets[v+1] entry
+    int E;                // end index of the last vertex's list in neighbors[]
+};
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {   // Debug aid: copies four graph descriptors out of Gs; prints only when DEBUG_INPUT_TEST is defined. Returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        // Visit the descriptors in the order G_0..G_3 (indices 3, 2, 1, 0 of Gs).
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+//#define DEBUG_GREEDY_VIS_TR
+
+#define distance(v0, v1)                                                \
+    (-1 * inner_product_v4_serial(v0, v1))
+
+    int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {   // Greedy walk over graphs Gs[0..NG-2] starting at V_ENTRY; writes the final vertex and distance to the two outputs.
+        float q[VSIZE];
+        // seen is not referenced anywhere in this function.
+        bsg_cuda_print_stat_start(0);
+        // Work from a local copy of the query vector.
+        memcpy(q, query, sizeof(q));
+
+        int   v_curr = V_ENTRY;
+        float d_curr = 0;
+        // distance() is the negated inner product, so a smaller value means a larger inner product.
+        d_curr = distance(q, &database[v_curr*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+        bsg_print_int(v_curr);
+        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+        for (int i = 0; i < NG-1; i++) {
+            struct graph G = Gs[i];
+            bool changed = true; // repeat on this graph until a full neighbor scan finds nothing closer
+            while (changed) {
+                changed = false;
+                // fetch neighbors
+                int dst_0 = G.offsets[v_curr];
+                int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; // last vertex: list runs to E
+                for (int dst_i = 0; dst_i < degree; dst_i++) {
+                    int dst = G.neighbors[dst_0+dst_i];
+                    // calc. iproduct
+                    float d = distance(q, &database[dst*VSIZE]);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                    bsg_print_int(dst);
+                    bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                    if (d < d_curr) {
+                        d_curr = d;
+                        v_curr = dst;
+                        changed = true;
+
+#if defined(DEBUG_GREEDY_VCURR_TR)
+                        bsg_print_int(v_curr);
+                        bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR)
+                    }
+                }
+            }
+        }
+
+        *v_curr_o = v_curr;
+        *d_curr_o = d_curr;
+
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
index ddea465b0..99614fc8b 100644
--- a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp
@@ -126,7 +126,7 @@ extern "C" {
                         v_curr = dst;
                         changed = true;
 
-#if defined(DEBUG_GREEDY_VIS_TR)
+#if defined(DEBUG_GREEDY_VCURR_TR)
                         bsg_print_int(v_curr);
                         bsg_print_float(d_curr);
 #endif // #if defined(DEBUG_GREEDY_VIS_TR)
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp
new file mode 100644
index 000000000..c60b3e125
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * IPNSW greedy-walk kernel: 2x2 tile group, distances from InnerProductParallel_v1
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // adjacency lists of one graph, as read by ipnsw_greedy_search() below
+    const int *offsets;   // offsets[v]: index in neighbors[] where the list of vertex v starts
+    const int *neighbors; // neighbor vertex ids, all lists stored back to back
+    int V;                // vertex count; vertex V-1 has no offsets[v+1] entry
+    int E;                // end index of the last vertex's list in neighbors[]
+};
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+    int input_test(const graph *Gs, const float *database, const float *query, int *seen)
+    {   // Debug aid: copies four graph descriptors out of Gs; prints only when DEBUG_INPUT_TEST is defined. Returns 0.
+#if defined(DEBUG_INPUT_TEST)
+        bsg_printf("Gs = %08x\n",       Gs);
+        bsg_printf("database = %08x\n", database);
+        bsg_printf("query = %08x\n",    query);
+        bsg_printf("seen  = %08x\n",    seen);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        // Visit the descriptors in the order G_0..G_3 (indices 3, 2, 1, 0 of Gs).
+        struct graph G;
+        int v_i [] = {G_0, G_1, G_2, G_3};
+        for (int j = 0; j < 4; ++j) {
+            int i = v_i[j];
+            memcpy(&G, &Gs[i], sizeof(G)); // copy the descriptor into a local before reading its fields
+#if defined(DEBUG_INPUT_TEST)
+            bsg_printf("G[%d].offsets   = %08x\n", j, G.offsets);
+            bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors);
+            bsg_printf("G[%d].V = %d\n", j, G.V);
+            bsg_printf("G[%d].E = %d\n", j, G.E);
+#endif // #if defined(DEBUG_INPUT_TEST)
+        }
+
+        return 0;
+    }
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+#define DEBUG_GREEDY_VIS_TR
+
+    // Greedy walk over graphs Gs[0..NG-2] from V_ENTRY, minimising the negated inner product with
+    // query. Tile 0 drives the walk; the other tiles serve partial products inside ip.init().
+    // Tile 0 writes the final vertex to *v_curr_o and its distance to *d_curr_o. Returns 0.
+    int ipnsw_greedy_search (const graph *Gs,
+                             bsg_attr_remote const float *__restrict database,
+                             const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {
+        float q[VSIZE];
+        bsg_cuda_print_stat_start(0);
+        memcpy(q, query, sizeof(q));
+        InnerProduct ip(database, q);
+        ip.init(); // tiles other than 0 stay in here until ip.exit() posts SYNC_DONE
+        if (__bsg_id == 0) {
+            bsg_saif_start();
+            int   v_curr = V_ENTRY;
+            // Unary minus keeps the negation in float; -1.0 * x promoted to double first.
+            float d_curr = -ip.inner_product(v_curr);
+
+#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+            bsg_print_int(v_curr);
+            bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR)
+
+            for (int i = 0; i < NG-1; i++) {
+                struct graph G = Gs[i];
+                bool changed = true;
+                while (changed) {
+                    changed = false;
+                    // fetch neighbors
+                    int dst_0 = G.offsets[v_curr];
+                    int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0;
+                    for (int dst_i = 0; dst_i < degree; dst_i++) {
+                        int dst = G.neighbors[dst_0+dst_i];
+                        // calc. iproduct
+                        float d = -ip.inner_product(dst);
+
+#if defined(DEBUG_GREEDY_VIS_TR)
+                        bsg_print_int(dst);
+                        bsg_print_float(d);
+#endif // #if defined(DEBUG_GREEDY_VIS_TR)
+
+                        if (d < d_curr) {
+                            d_curr = d;
+                            v_curr = dst;
+                            changed = true;
+
+#if defined(DEBUG_GREEDY_VCURR_TR)
+                            bsg_print_int(v_curr);
+                            bsg_print_float(d_curr);
+#endif // #if defined(DEBUG_GREEDY_VCURR_TR)
+                        }
+                    }
+                }
+            }
+
+            *v_curr_o = v_curr;
+            *d_curr_o = d_curr;
+            bsg_saif_end();
+        }
+        ip.exit(); // release the worker tiles from ip.init(); without this they never return
+        bsg_cuda_print_stat_end(0);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp
new file mode 100644
index 000000000..12da1aebb
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp
@@ -0,0 +1,113 @@
+/*
+ * Condensed listing of the greedy_walk_v4 kernel, annotated with the loc markers that loc.sh sums
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // same declaration as in greedy_walk_v4/kernel.cpp
+    const int *offsets;   // presumably per-vertex start index into neighbors[], as in kernel.cpp -- confirm
+    const int *neighbors; // declared as a plain pointer, although the listing below calls it like a function
+    int V;                // presumably the vertex count -- confirm
+    int E;                // presumably the total length of neighbors[] -- confirm
+};
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_INPUT_TEST
+
+// Uncomment to turn on debugging
+//#define DEBUG_GREEDY_VCURR_TR
+#define DEBUG_GREEDY_VIS_TR
+
+    /**/
+    int ipnsw_greedy_search (const graph *Gs,
+                             bsg_attr_remote const float *__restrict database,
+                             const float *query, int *seen,
+                             int *v_curr_o, float *d_curr_o)
+    {   // Shortened form of the greedy walk in kernel.cpp with the debug tracing removed.
+    /* loc:2 */
+        /**/
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+        /* loc:2 */
+        // NOTE(review): ip.exit() is never called in this listing -- confirm that is intended.
+        /* init code - can be hidden by library*/
+        InnerProduct ip(database, q);
+        ip.init();
+        if (__bsg_id == 0) {
+            bsg_saif_start();
+            /**/
+            int   v_curr = V_ENTRY;
+            float d_curr = 0;
+
+            d_curr = -1.0 * ip.inner_product(v_curr);
+
+            /**/
+            for (int i = 0; i < NG-1; i++) {
+                struct graph G = Gs[i];
+                bool changed = true;
+                while (changed) {
+                    changed = false;
+                    /* loc:5 */
+                    // fetch neighbors
+                    /**/
+                    for (int dst : G.neighbors(v_curr)) { // NOTE(review): neighbors is a const int* above, so this is not callable as written
+                        float d = -1.0 * ip.inner_product(dst);
+                        if (d < d_curr) {
+                            d_curr = d;
+                            v_curr = dst;
+                            changed = true;
+                        }
+                    }
+                }
+            }
+            /* loc: 10 */
+            /**/
+            *v_curr_o = v_curr;
+            *d_curr_o = d_curr;
+        }
+        return 0;
+    }
+    /* loc: 5 */
+    
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh
new file mode 100644
index 000000000..1f12e76ba
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh
@@ -0,0 +1 @@
+grep loc: kernel.loc.cpp | cut -d: -f2 | cut -d'*' -f1 | awk 'BEGIN{x=0}{x = x+$1}END{print x}' # sum of the "loc:N" markers
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
index 8bb83077a..6099411c2 100644
--- a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp
@@ -2,6 +2,8 @@
 #include "bsg_striped_array.hpp"
 #include <cmath>
 #include <numeric>
+#include <bsg_manycore.hpp>
+#include "sleep_until_valid.hpp"
 
 template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=10>
 __attribute__((noinline))
@@ -72,8 +74,8 @@ __attribute__((noinline))
 FLOAT_T inner_product_v4(const FLOAT_T *__restrict a,
                          bsg_attr_remote const FLOAT_T *__restrict b)
 {
-    register FLOAT_T r[UNROLL];
-    for (int i = __bsg_id * BSIZE; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) {
+    register FLOAT_T r[UNROLL] = {0};
+    for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) {
 #pragma bsg_unroll(32)
         for (int j = 0; j < BSIZE; ++j) {
 #pragma bsg_unroll(32)
@@ -82,8 +84,236 @@ FLOAT_T inner_product_v4(const FLOAT_T *__restrict a,
             }
         }
     }
-    int rs = 0.0;
+    FLOAT_T rs = 0.0;
     for (int i = 0; i < UNROLL; ++i)
         rs += r[i];
     return rs;
 }
+
+// Partial dot product of a and b for the calling tile: tile __bsg_id handles the BSIZE*UNROLL-element
+// chunk starting at __bsg_id*BSIZE*UNROLL, then every (TG_X*TG_Y)-th chunk after it.
+template<std::size_t TG_X, std::size_t TG_Y, typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5>
+__attribute__((noinline))
+FLOAT_T inner_product_parallel_v1(const FLOAT_T *__restrict a,
+                                  bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    static_assert(VSIZE % (BSIZE * UNROLL) == 0, "VSIZE must be a whole number of BSIZE*UNROLL chunks");
+    FLOAT_T r[UNROLL] = {0.0}; // UNROLL independent running sums; `register` dropped (removed in C++17)
+    for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i) rs += r[i];
+    return rs;
+}
+
+
+// Dot product of a and b over all VSIZE elements on the calling tile, using UNROLL running sums.
+template<typename FLOAT_T=float, std::size_t VSIZE=100, std::size_t BSIZE=5, int UNROLL=5>
+__attribute__((noinline))
+FLOAT_T inner_product_v4_serial(const FLOAT_T *__restrict a, bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    static_assert(VSIZE % (BSIZE * UNROLL) == 0, "VSIZE must be a whole number of BSIZE*UNROLL chunks");
+    FLOAT_T r[UNROLL] = {0}; // `register` dropped: it is removed in C++17 and has no use on an array
+    for (int i = 0; i < VSIZE; i += UNROLL * BSIZE) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i) rs += r[i];
+    return rs;
+}
+
+
+// Partial dot product of a and b for participant `id` out of TG_N: handles the BSIZE*UNROLL-element
+// chunk starting at id*BSIZE*UNROLL, then every TG_N-th chunk after it.
+template<std::size_t TG_N,
+         typename    FLOAT_T=float,
+         std::size_t VSIZE=100,
+         std::size_t BSIZE=5,
+         int         UNROLL=5>
+FLOAT_T inner_product_parallel_v2(
+    int id,
+    const FLOAT_T *__restrict a,
+    bsg_attr_remote const FLOAT_T *__restrict b)
+{
+    static_assert(VSIZE % (BSIZE * UNROLL) == 0, "VSIZE must be a whole number of BSIZE*UNROLL chunks");
+    FLOAT_T r[UNROLL] = {0.0}; // `register` dropped: it is removed in C++17 and has no use on an array
+    for (int i = id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_N) {
+#pragma bsg_unroll(32)
+        for (int j = 0; j < BSIZE; ++j) {
+#pragma bsg_unroll(32)
+            for (int k =0 ; k < UNROLL; ++k) {
+                r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]);
+            }
+        }
+    }
+    FLOAT_T rs = 0.0;
+    for (int i = 0; i < UNROLL; ++i) rs += r[i];
+    return rs;
+}
+
+template <std::size_t TG_X, std::size_t TG_Y>
+class InnerProductParallel_v1 { // inner product of a database row with a fixed vector, split across a TG_X x TG_Y tile group
+public:
+    static constexpr std::size_t VSIZE = 100;
+    static constexpr std::size_t TG_N = TG_X * TG_Y;
+    static constexpr int SYNC_DONE = -2; // index value that makes init() return
+    static constexpr int SYNC_INV  = -1; // index value meaning "no request posted"
+    // t1: base of the row-major database (rows of VSIZE floats); t2: the vector every row is multiplied with.
+    InnerProductParallel_v1(bsg_attr_remote const float *t1, const float *t2) {
+        _inf = INFINITY;
+        for (int i = 0; i < TG_N; ++i)
+            _partial[i] = _inf; // +infinity marks a partial-result slot as empty
+
+        for (int x = 0; x < TG_X; ++x)
+            for (int y = 0; y < TG_Y; ++y)
+                _t1_idx_group[bsg_x_y_to_id(x,y)]
+                    = bsg_tile_group_remote_pointer(x,y,&_t1_idx);
+        // NOTE(review): every tile resets its own _t1_idx below; confirm tile 0 cannot post a request before that.
+        _t1 = t1;
+        _t2 = t2;
+        _t1_idx = SYNC_INV;
+    }
+    // Worker loop: returns at once on tile 0; other tiles serve requests until SYNC_DONE is posted.
+    void init() {
+        if (__bsg_id == 0) {
+            return;
+        }
+
+        float p = 0.0;
+        int t1_idx;
+        float *partial_result = bsg_tile_group_remote_pointer(0, 0, &_partial[__bsg_id]); // this tile's slot on tile (0,0)
+
+        while (true) {
+            t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); // wait for a request; the slot is reset to SYNC_INV
+            if (t1_idx == SYNC_DONE)
+                break;
+
+            p = inner_product_parallel_v1<TG_X,TG_Y>(_t2, &_t1[t1_idx * VSIZE]);
+            *partial_result = p;
+        }
+    }
+    // Tile 0 only: posts idx to every tile and returns the sum of all partial products; other tiles get 0.
+    float inner_product(int idx) {
+        if (__bsg_id != 0)
+            return 0.0;
+
+        for (int tile = 0; tile < TG_X*TG_Y; ++tile)
+            *_t1_idx_group[tile] = idx; // also written to tile 0's own slot, which init() never reads on tile 0
+
+        _partial[__bsg_id] = inner_product_parallel_v1<TG_X,TG_Y>(_t2, &_t1[idx * VSIZE]);
+
+        float r = 0.0;
+        for (int tile = 0; tile <TG_X*TG_Y; ++tile) {
+            float tmp = sleep_until_valid(&_partial[tile], _inf); // waits for the slot, then marks it empty again
+            r += tmp;
+        }
+
+        return r;
+    }
+    // Tile 0 only: posts SYNC_DONE to every tile so that their init() loops end.
+    void exit() {
+        if (__bsg_id != 0)
+            return;
+
+        for (int tile = 0; tile < TG_X*TG_Y; ++tile)
+            *_t1_idx_group[tile] = SYNC_DONE;
+
+        return;
+    }
+
+    bsg_attr_remote const float   *_t1;                 // database base pointer
+    const float                   *_t2;                 // fixed vector
+    int                            _t1_idx;             // request slot written by tile 0
+    int                           *_t1_idx_group[TG_N]; // pointer to _t1_idx on each tile of the group
+    float                          _partial[TG_N];      // per-tile partial results, read on tile 0
+    float                          _inf;                // "empty" marker for _partial
+};
+
+template <std::size_t TG_Y>
+class InnerProductParallel_Y { // inner product split across the TG_Y tiles that share this tile's x coordinate
+public:
+    static constexpr std::size_t VSIZE = 100;
+    static constexpr int SYNC_DONE = -2; // index value that makes init() return
+    static constexpr int SYNC_INV  = -1; // index value meaning "no request posted"
+    // t1: base of the row-major database (rows of VSIZE floats); t2: the vector every row is multiplied with.
+    InnerProductParallel_Y(bsg_attr_remote const float *t1, const float *t2) {
+        _inf = INFINITY;
+        for (int i = 0; i < TG_Y; ++i)
+            _partial[i] = _inf; // +infinity marks a partial-result slot as empty
+
+        for (int y = 0; y < TG_Y; ++y)
+            _t1_idx_group[y] = bsg_tile_group_remote_pointer(__bsg_x, y, &_t1_idx);
+        // NOTE(review): every tile resets its own _t1_idx below; confirm the y == 0 tile cannot post before that.
+        _t1 = t1;
+        _t2 = t2;
+        _t1_idx = SYNC_INV;
+    }
+    // Worker loop: returns at once on the y == 0 tile; other tiles serve requests until SYNC_DONE is posted.
+    void init() {
+        if (__bsg_y == 0) {
+            return;
+        }
+
+        float p = 0.0;
+        int t1_idx;
+        float *partial_result = bsg_tile_group_remote_pointer(__bsg_x, 0, &_partial[__bsg_y]); // this tile's slot on tile (x,0)
+
+        while (true) {
+            t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); // wait for a request; the slot is reset to SYNC_INV
+            if (t1_idx == SYNC_DONE)
+                break;
+
+            p = inner_product_parallel_v2<TG_Y>(__bsg_y, _t2, &_t1[t1_idx * VSIZE]);
+            *partial_result = p;
+        }
+    }
+    // y == 0 tile only: posts idx to the column and returns the sum of all partial products; other tiles get 0.
+    float inner_product(int idx) {
+        if (__bsg_y != 0)
+            return 0.0;
+
+        for (int tile = 0; tile < TG_Y; ++tile)
+            *_t1_idx_group[tile] = idx;
+
+        _partial[__bsg_y] = inner_product_parallel_v2<TG_Y>(__bsg_y, _t2, &_t1[idx * VSIZE]);
+
+        float r = 0.0;
+        for (int tile = 0; tile <TG_Y; ++tile) {
+            float tmp = sleep_until_valid(&_partial[tile], _inf); // waits for the slot, then marks it empty again
+            r += tmp;
+        }
+
+        return r;
+    }
+    // y == 0 tile only: posts SYNC_DONE to every tile of the column so that their init() loops end.
+    void exit() {
+        if (__bsg_y != 0)
+            return;
+
+        for (int tile = 0; tile < TG_Y; ++tile)
+            *_t1_idx_group[tile] = SYNC_DONE;
+
+        return;
+    }
+
+    bsg_attr_remote const float   *_t1;                  // database base pointer
+    const float                   *_t2;                  // fixed vector
+    int                            _t1_idx;              // request slot written by the y == 0 tile
+    int                           *_t1_idx_group [TG_Y]; // pointer to _t1_idx on each tile of the column
+    float                          _partial      [TG_Y]; // per-tile partial results, read on the y == 0 tile
+    float                          _inf;                 // "empty" marker for _partial
+};
+
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
index 2ef683838..2e3621fef 100644
--- a/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp
@@ -39,6 +39,10 @@ class DenseSet {
         _data[i] = 1;
     }
 
+    void atomic_insert(T i) { // same interface as DenseSet_v1::atomic_insert
+        insert(i); // forwards to the plain insert(); no atomic instruction is issued here
+    }
+
     bool in(T i) {
         return _data[i] == 1;
     }
@@ -57,6 +61,16 @@ class DenseSet_v1 {
         _data[word(i)] |= (1 << bit(i));
     }
 
+    void atomic_insert(T i) {
+        // Sets the bit for i with a single amoor.w; the old word is discarded (rd = x0).
+        // "+A": the word is read and written, and AMOs need the bare address in a register.
+        int *ptr = &_data[word(i)];
+        int r    = 1 << bit(i);
+        asm volatile ("amoor.w x0, %[r], %[ptr]"
+                      : [ptr] "+A" (*ptr)
+                      : [r] "r" (r));
+    }
+
     bool in(T i) {
         return _data[word(i)] & (1 << bit(i));
     }
@@ -70,4 +84,3 @@ class DenseSet_v1 {
     }
     int *_data;
 };
-
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp
new file mode 100644
index 000000000..d59088d75
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include <bsg_manycore.h>
+// Issues lr.w.aq on *ptr and returns the loaded word. NOTE(review): presumably stalls until *ptr is written -- confirm.
+template <typename T>
+static inline T sleep_on_update(volatile T *ptr)
+{
+    static_assert(sizeof(T) == 4, "lr.w loads exactly one 32-bit word");
+    T r;
+    // "A" keeps the address in a register: lr.w has no offset addressing form, which "m" could produce.
+    asm volatile ("lr.w.aq %[r], %[ptr]" : [r] "=r" (r) : [ptr] "A" (*ptr));
+    return r;
+}
+
+// Waits until *ptr holds something other than not_valid, writes not_valid back so the slot can be
+// reused, and returns the value that was observed.
+template <typename T>
+static inline T sleep_until_valid(volatile T *ptr, T not_valid)
+{
+    static_assert(sizeof(T) == 4, "lr.w loads exactly one 32-bit word");
+    T r;
+    // First read. "A" keeps the address in a register: lr.w has no offset form, which "m" could produce.
+    asm volatile ("lr.w %[r], %[ptr]" : [r] "=r" (r) : [ptr] "A" (*ptr));
+    while (r == not_valid) {
+        r = sleep_on_update(ptr);
+    }
+    *ptr = not_valid; // mark the slot empty again for the next write
+    return r;
+}
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp
new file mode 100644
index 000000000..9fe605f3a
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Inner-product micro-benchmark kernel, parallel v1 (2x2 tile group, barrier-synchronised)
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100
+#define NG 4
+#define V_ENTRY 82026
+
+#define EF        128
+#define N_RESULTS 10
+
+#define VISIT_BUFSIZE 512
+
+#define G_0 3
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph {            // not referenced by the functions of this kernel
+    const int *offsets;   // presumably per-vertex start index into neighbors[], as in the greedy-walk kernels -- confirm
+    const int *neighbors; // presumably the concatenated neighbor lists -- confirm
+    int V;                // presumably the vertex count -- confirm
+    int E;                // presumably the total length of neighbors[] -- confirm
+};
+
+//#define DEBUG_SLAVE
+//#define DEBUG_MASTER
+
+using barrier = bsg_barrier<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_parallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+
+#define SYNC_DONE -1
+   
+    // Master loop of the micro-benchmark: for each index k in this tile group's visit list, posts k to
+    // every tile, adds up the per-tile partial products, and finally tells the workers to stop.
+    __attribute__((noinline))
+    int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int N,
+                                  int *visit_remote_all,
+                                  barrier *group_barrier,
+                                  std::atomic<int> *kp,
+                                  std::atomic<float> *rp)
+    {
+        float r = 0.0;
+        int visit[VISIT_BUFSIZE];
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id];
+        // pre-compute addresses on remote tiles
+        std::atomic<int>   *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        std::atomic<float> *rp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) {
+            for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) {
+                kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp);
+                rp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, rp);
+            }
+        }
+
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            size_t sz = std::min(VISIT_BUFSIZE, (N-i));
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+            for (int j = 0; j < sz; ++j) {
+                // read k
+                int k = visit[j];
+                // set k on all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+                    kp_group[tile]->store(k, std::memory_order_relaxed);
+                // do inner product
+                group_barrier->sync(); // signal ready
+                float r_local = iproduct(query, &database[k * VSIZE]);
+#ifdef DEBUG_MASTER
+                bsg_print_float(r_local);
+#endif
+                rp_group[__bsg_id]->store(r_local, std::memory_order_relaxed);
+                group_barrier->sync(); // signal done
+                // read r from all tiles
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) {
+                    float r_remote = rp_group[tile]->load(std::memory_order_relaxed);
+#ifdef DEBUG_MASTER
+                    bsg_print_float(r_remote);
+#endif
+                    r += r_remote;
+                }
+            }
+        }
+
+        // Post SYNC_DONE and meet the workers at their "ready" barrier; otherwise they wait there forever.
+        for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+            kp_group[tile]->store(SYNC_DONE, std::memory_order_relaxed);
+        group_barrier->sync();
+        return (int)r;
+    }
+
+    __attribute__((noinline))
+    void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  barrier *group_barrier,
+                                  std::atomic<int> *kp,
+                                  std::atomic<float> *rp)
+    {   // Worker loop: per round, reads the posted index from *kp and stores this tile's partial product in *rp.
+        float r = 0.0;
+        int k;
+        // Each round is bracketed by two barrier syncs; the loop ends when the posted index is SYNC_DONE.
+        while (true) {
+            // load next
+            group_barrier->sync(); // signal ready
+            k = kp->load(std::memory_order_relaxed);
+            if (k == SYNC_DONE)
+                break;
+
+            // do inner product
+            r = iproduct(query, &database[k * VSIZE]);
+            rp->store(r, std::memory_order_relaxed);
+#ifdef DEBUG_SLAVE
+            bsg_print_float(r);
+#endif
+            group_barrier->sync(); // signal done
+        }
+    }
+    
+    // Kernel entry point: tile 0 runs the master loop and returns its accumulated sum truncated to int;
+    // every other tile runs the worker loop and returns 0.
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        static barrier group_barrier;
+        static std::atomic<int> k;
+        static std::atomic<float> r;
+        float rr = 0.0f; // workers never assign rr; uninitialised, their return value was indeterminate
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+
+        bsg_cuda_print_stat_start(0);
+        if (__bsg_id == 0) {
+            // enter master loop
+            rr = inner_product_ubmk_master(database, q, N, visit_remote_all,
+                                           &group_barrier, &k, &r);
+        } else {
+            // enter slave loop
+            inner_product_ubmk_slave(database, q, &group_barrier, &k, &r);
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(rr);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp
new file mode 100644
index 000000000..df8be2dae
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Inner-product microbenchmark: tile 0 posts vector indices to every tile and sums their results.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+#include "sleep_until_valid.hpp"
+
+//#define V  1000000
+#define VSIZE 100 // floats per vector: size of the query buffer and stride into database
+#define NG 4 // not referenced in this file
+#define V_ENTRY 82026 // not referenced in this file
+
+#define EF        128 // not referenced in this file
+#define N_RESULTS 10 // not referenced in this file
+
+#define VISIT_BUFSIZE 512 // max number of visit indices copied into the local buffer per batch
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define DEBUG_MASTER  (uncomment to print every partial result seen by the master loop)
+//#define DEBUG_SLAVE   (uncomment to print every result computed by the slave loop)
+#define iproduct(x,y)                                                   \
+    inner_product_parallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y)
+// Sentinel values for the index slot shared between the master and slave loops:
+    #define SYNC_INV  -2 // passed to sleep_until_valid as the invalid marker (presumably "nothing posted yet")
+    #define SYNC_DONE -1 // written by the master after the last index; the slave loop exits on it
+
+        __attribute__((noinline))
+    int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int N, // number of visit indices belonging to one tile group
+                                  int *visit_remote_all, // visit lists of all tile groups, N entries each
+                                  int   *kp, // index slot; its address on each tile is derived below
+                                  float *rp) // result slots, indexed by tile id
+    {
+        float r = 0.0;
+        int visit[VISIT_BUFSIZE];
+        int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id];
+        // Master loop (called on tile 0): posts each visit index to every tile and sums the results.
+        // pre-compute the address of kp on every tile of the group
+        int   *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+
+        for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) {
+            for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) {                
+                kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp);
+            }
+        }
+        // walk the visit list in batches of at most VISIT_BUFSIZE indices
+        for (int i = 0; i < N; i += VISIT_BUFSIZE) {
+            size_t sz = std::min(VISIT_BUFSIZE, (N-i));
+            memcpy(visit, &visit_remote[i], sz*sizeof(int));
+
+            for (int j = 0; j < sz; ++j) {
+                // read k
+                int k = visit[j];
+
+                // set k on all tiles (the loop also covers this tile's own slot)
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+                    *kp_group[tile] = k;
+                // compute this tile's own result and record it in this tile's slot
+                float r_local = iproduct(query, &database[k * VSIZE]);
+#ifdef DEBUG_MASTER
+                bsg_print_float(r_local);
+#endif
+                rp[__bsg_id] = r_local;
+
+                // read r from all tiles; INFINITY is passed to sleep_until_valid as the invalid marker
+                for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) {
+                    float r_remote = sleep_until_valid(&rp[tile], INFINITY);
+#ifdef DEBUG_MASTER
+                    bsg_print_float(r_remote);
+#endif
+                    r += r_remote;
+                }
+            }
+        }
+        // all indices handled: post SYNC_DONE so the slave loops exit
+        for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile)
+            *kp_group[tile] = SYNC_DONE;
+
+        return (int)r; // sum of every collected result, truncated to int
+    }
+
+    __attribute__((noinline))
+    void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database,
+                                  const float * __restrict query,
+                                  int   *kp,
+                                  float *rp)
+    {
+        // Slave loop: repeatedly take an index from *kp, compute the inner product
+        // for that vector and store it through rp; returns once SYNC_DONE is seen.
+        for (;;) {
+            // SYNC_INV is passed to sleep_until_valid as the invalid marker
+            const int next = sleep_until_valid(kp, SYNC_INV);
+            if (next == SYNC_DONE) {
+                return;
+            }
+
+            const float partial = iproduct(query, &database[next * VSIZE]);
+            *rp = partial;
+#ifdef DEBUG_SLAVE
+            bsg_print_float(partial);
+#endif
+        }
+    }
+    
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        // Kernel entry: tile 0 (__bsg_id == 0) runs the master loop, every other tile the slave loop.
+        static int   k = SYNC_INV;
+        static float r [BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM];
+        // A `= {INFINITY}` initializer only sets r[0]; mark every slot invalid before any index is posted.
+        std::fill_n(r, BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM, INFINITY);
+        float rr = 0.0f; // only tile 0 assigns rr below; other tiles return 0, not an indeterminate value
+        float q[VSIZE];
+        memcpy(q, query, sizeof(q));
+        bsg_cuda_print_stat_start(0);
+        if (__bsg_id == 0) {
+            // enter master loop; it reads the result slots in this tile's copy of r
+            rr = inner_product_ubmk_master(database, q, N, visit_remote_all, &k, r);
+        } else {
+            // enter slave loop; results go to slot __bsg_id of r on tile (0,0)
+            inner_product_ubmk_slave(database, q, &k, bsg_tile_group_remote_pointer(0,0, &r[__bsg_id]));
+        }
+        bsg_cuda_print_stat_end(0);
+        return (int)(rr);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp
new file mode 100644
index 000000000..be92dfcc9
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp
@@ -0,0 +1,80 @@
+/*
+ * Inner-product microbenchmark built on the InnerProductParallel_v1 helper class.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 2
+#define BSG_TILE_GROUP_Y_DIM 2
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+//#include <bsg_tile_group_barrier.h>
+#include <bsg_tile_group_barrier.hpp>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+#include <atomic>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+#include "sleep_until_valid.hpp"
+
+//#define V  1000000
+#define VSIZE 100 // floats per vector: size of the local query buffer
+#define NG 4 // not referenced in this file
+#define V_ENTRY 82026 // not referenced in this file
+
+#define EF        128 // not referenced in this file
+#define N_RESULTS 10 // not referenced in this file
+
+#define VISIT_BUFSIZE 512 // max number of visit indices copied into the local buffer per batch
+
+using InnerProduct = InnerProductParallel_v1<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>; // inner-product helper sized to the tile group
+using barrier = bsg_barrier<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>; // barrier type sized to the tile group
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        // Benchmark entry point: sums ip.inner_product(k) over this tile group's
+        // N visit indices and returns the sum truncated to int.
+        float local_query[VSIZE];
+        memcpy(local_query, query, sizeof(local_query));
+        barrier group_sync;
+
+        bsg_cuda_print_stat_start(0);
+
+        InnerProduct ip(database, local_query);
+        ip.init();
+        float total = 0.0;
+        int batch[VISIT_BUFSIZE];
+        // this tile group's slice of the visit list
+        int *group_visits = &visit_remote_all[N * __bsg_tile_group_id];
+
+        for (int base = 0; base < N; base += VISIT_BUFSIZE) {
+            // stage up to VISIT_BUFSIZE indices in the local batch buffer
+            const size_t count = std::min(VISIT_BUFSIZE, (N - base));
+            memcpy(batch, &group_visits[base], count * sizeof(int));
+
+            for (int j = 0; j < count; ++j)
+                total += ip.inner_product(batch[j]);
+        }
+
+        ip.exit();
+        bsg_cuda_print_stat_end(0);
+        group_sync.sync();
+        return (int)(total);
+    }
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp
new file mode 100644
index 000000000..9d9dcbc11
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Inner-product microbenchmark for a 1x1 tile group, using inner_product_v3.
+ */
+
+// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
+// before bsg_manycore.h and bsg_tile_group_barrier.h are
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_manycore.h>
+#include <bsg_tile_group_barrier.h>
+#include <string.h>
+#include <queue>
+#include <algorithm>
+#include <array>
+//#include <hello_world.hpp>
+#include "inner_product.hpp"
+#include "heap.hpp"
+//#include "inner_product.h"
+
+/* We wrap all external-facing C++ kernels with `extern "C"` to
+ * prevent name mangling
+ */
+
+//#define V  1000000
+#define VSIZE 100 // floats per vector: size of the query buffer and stride into database
+#define NG 4 // not referenced in this file
+#define V_ENTRY 82026 // not referenced in this file
+
+#define EF        128 // not referenced in this file
+#define N_RESULTS 10 // not referenced in this file
+
+#define VISIT_BUFSIZE 512 // max number of visit indices copied into the local buffer per batch
+
+#define G_0 3 // G_0..G_3 are not referenced in this file
+#define G_1 2
+#define G_2 1
+#define G_3 0
+
+struct graph { // graph description; not referenced elsewhere in this file
+    const int *offsets;   // NOTE(review): presumably per-vertex start offsets into neighbors — confirm
+    const int *neighbors; // NOTE(review): presumably the concatenated adjacency lists — confirm
+    int V;                // NOTE(review): presumably the vertex count — confirm
+    int E;                // NOTE(review): presumably the edge count — confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define iproduct(x,y)                                                   \
+    inner_product_v3<BSG_TILE_GROUP_X_DIM, BSG_TILE_GROUP_Y_DIM>(x,y) // inner-product routine used by the benchmark loop
+
+    int inner_product_ubmk(bsg_attr_remote const float * __restrict database,
+                           const float * __restrict query,
+                           int N,
+                           int *visit_remote_all)
+    {
+        // Single-tile benchmark: prints its arguments, then sums the inner product of the
+        // query with each vector named in this tile group's visit list.
+        float local_query[VSIZE];
+        int batch[VISIT_BUFSIZE];
+        float total = 0;
+        int *group_visits = &visit_remote_all[N * __bsg_tile_group_id];
+
+        // debug output: negated tile-group id, N, then the three pointer arguments in hex
+        bsg_print_int(-1 * __bsg_tile_group_id);
+        bsg_print_int(N);
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(database));
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(query));
+        bsg_print_hexadecimal(reinterpret_cast<unsigned>(visit_remote_all));
+
+        memcpy(local_query, query, sizeof(local_query));
+
+        bsg_cuda_print_stat_start(0);
+        for (int base = 0; base < N; base += VISIT_BUFSIZE) {
+            // stage up to VISIT_BUFSIZE indices in the local batch buffer
+            const size_t count = std::min(VISIT_BUFSIZE, (N - base));
+            memcpy(batch, &group_visits[base], count * sizeof(int));
+
+            for (int j = 0; j < count; ++j)
+                total += iproduct(local_query, &database[batch[j] * VSIZE]);
+        }
+        bsg_cuda_print_stat_end(0);
+
+        // the accumulated sum is truncated to int for the return value
+        return (int)(total);
+    }
+#ifdef __cplusplus
+}
+#endif

From 2d040fb880e4ba9351718c3b45424322cd46ded7 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 16:03:50 -0700
Subject: [PATCH 03/22] [ipnsw] Newer versions

---
 examples/sdh-eval-workloads/ipnsw/Makefile | 203 ++++++++++++++++++---
 1 file changed, 182 insertions(+), 21 deletions(-)

diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index b4bfa09d7..a8350c2cd 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -44,24 +44,125 @@ BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm
 ################################################################################
 # Define the range of versions
 ################################################################################
-# Kernel versions. See kernel/README.md for more information.  Version names do
-# not need to use v* and can be any string
-VERSIONS := greedy_walk    #  inner product with ipc=0.3 (8x4)
-VERSIONS += greedy_walk_v1 #  inner product with ipc=0.43 (8x4)
-VERSIONS += greedy_walk_v2 #  inner product with FLOPS/cycle=0.2  (8x4)
-VERSIONS += greedy_walk_v3 #  inner product with FLOPS/cycle=0.26 (8x4)
-VERSIONS += beam_search    #  very slow - uses a very dumb sparse set
-VERSIONS += beam_search_v1 #  dense set - inner product with ipc=0.3  (8x4)
-VERSIONS += beam_search_v2 #  dense set - inner product with ipc=0.43 (8x4)
-VERSIONS += beam_search_v3 #  + inner_product_v2 (flops/cycle=0.2039) (8x4)
-VERSIONS += beam_search_v4 #  + inner_product_v3 (flops/cycle=0.2663) (8x4)
-VERSIONS += beam_search_v5 #  + Bit vector for dense set
+ #  inner product with ipc=0.3 (8x4)
+VERSIONS := greedy_walk
+greedy_walk-grp-x := 1
+greedy_walk-grp-y := 1
+#  inner product with ipc=0.43 (8x4)
+VERSIONS += greedy_walk_v1
+greedy_walk_v1-grp-x := 1
+greedy_walk_v1-grp-y := 1
+#  inner product with FLOPS/cycle=0.2  (8x4)
+VERSIONS += greedy_walk_v2
+greedy_walk_v2-grp-x := 1
+greedy_walk_v2-grp-y := 1
+#  inner product with FLOPS/cycle=0.26 (8x4)
+VERSIONS += greedy_walk_v3
+greedy_walk_v3-grp-x := 1
+greedy_walk_v3-grp-y := 1
+#  inner product v4-serial
+VERSIONS += greedy_walk_v3-ipv4serial
+greedy_walk_v3-ipv4serial-grp-x := 1
+greedy_walk_v3-ipv4serial-grp-y := 1
+#  greedy_walk_v3 + ParallelInnerProduct_v1
+VERSIONS += greedy_walk_v4
+greedy_walk_v4-grp-x := 2
+greedy_walk_v4-grp-y := 2
+
+#  very slow - uses a very dumb sparse set
+VERSIONS += beam_search
+beam_search-grp-x := 1
+beam_search-grp-y := 1
+#  dense set - inner product with ipc=0.3  (8x4)
+VERSIONS += beam_search_v1
+beam_search_v1-grp-x := 1
+beam_search_v1-grp-y := 1
+#  dense set - inner product with ipc=0.43 (8x4)
+VERSIONS += beam_search_v2
+beam_search_v2-grp-x := 1
+beam_search_v2-grp-y := 1
+#  + inner_product_v2 (flops/cycle=0.2039) (8x4)
+VERSIONS += beam_search_v3
+beam_search_v3-grp-x := 1
+beam_search_v3-grp-y := 1
+#  + inner_product_v3 (flops/cycle=0.2663) (8x4)
+VERSIONS += beam_search_v4
+beam_search_v4-grp-x := 1
+beam_search_v4-grp-y := 1
+#  + Bit vector for dense set
+VERSIONS += beam_search_v5
+beam_search_v5-grp-x := 1
+beam_search_v5-grp-y := 1
+#  + Bit vector for dense set + inner product v4 seria;
+VERSIONS += beam_search_v5-ipv4serial
+beam_search_v5-ipv4serial-grp-x := 1
+beam_search_v5-ipv4serial-grp-y := 1
+#  beam_search_v5 + inner_product_parallel_v3
+VERSIONS += beam_search_v6
+beam_search_v6-grp-x := 2
+beam_search_v6-grp-y := 2
+# beam_search_v6 but with 1x2 tile group
+VERSIONS += beam_search_v7
+beam_search_v7-grp-x := 1
+beam_search_v7-grp-y := 2
+# beam_search_v5 but edges of candidates traversed in parallel
+VERSIONS += beam_search_v8
+beam_search_v8-grp-x := 4
+beam_search_v8-grp-y := 4
+# combination of beam_search_v8 + beam_search_v6
+VERSIONS += beam_search_v9
+beam_search_v9-grp-x := 4
+beam_search_v9-grp-y := 4
+# beam_search_v5 but edges of candidates traversed in parallel
+VERSIONS += beam_search_v10
+beam_search_v10-grp-x := 8
+beam_search_v10-grp-y := 4
+
+# debugging this makefile
 VERSIONS += debug
-VERSIONS += iproduct_ubmk # baseline - ipc = 0.3
-VERSIONS += iproduct_ubmk_v1 # using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867
-VERSIONS += iproduct_ubmk_v2 # + FMA, ipc = 0.386, flops/cycle = 0.2039
-VERSIONS += iproduct_ubmk_v3 # + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4)
-VERSIONS += iproduct_ubmk_v4 # Slightly cleaner code than v3 - similar performance
+debug-grp-x := 0
+debug-grp-y := 0
+
+ # baseline - ipc = 0.3
+VERSIONS += iproduct_ubmk
+iproduct_ubmk-grp-x := 1
+iproduct_ubmk-grp-y := 1
+# using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867
+VERSIONS += iproduct_ubmk_v1
+iproduct_ubmk_v1-grp-x := 1
+iproduct_ubmk_v1-grp-y := 1
+# + FMA, ipc = 0.386, flops/cycle = 0.2039
+VERSIONS += iproduct_ubmk_v2
+iproduct_ubmk_v2-grp-x := 1
+iproduct_ubmk_v2-grp-y := 1
+# + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4)
+VERSIONS += iproduct_ubmk_v3
+iproduct_ubmk_v3-grp-x := 1
+iproduct_ubmk_v3-grp-y := 1
+# Slightly cleaner code than v3 - similar performance
+VERSIONS += iproduct_ubmk_v4
+iproduct_ubmk_v4-grp-x := 1
+iproduct_ubmk_v4-grp-y := 1
+ #...
+VERSIONS += iproduct_ubmk_parallel
+iproduct_ubmk_parallel-grp-x := 1
+iproduct_ubmk_parallel-grp-y := 1
+ #...
+VERSIONS += iproduct_ubmk-parallel_v1
+iproduct_ubmk-parallel_v1-grp-x := 2
+iproduct_ubmk-parallel_v1-grp-y := 2
+ #...
+VERSIONS += iproduct_ubmk-parallel_v1.1
+iproduct_ubmk-parallel_v1.1-grp-x := 2
+iproduct_ubmk-parallel_v1.1-grp-y := 2
+ #...
+VERSIONS += iproduct_ubmk-parallel_v2
+iproduct_ubmk-parallel_v2-grp-x := 2
+iproduct_ubmk-parallel_v2-grp-y := 2
+#... same as v2 but with (1x4 tg)
+VERSIONS += iproduct_ubmk-parallel_v3
+iproduct_ubmk-parallel_v3-grp-x := 2
+iproduct_ubmk-parallel_v3-grp-y := 2
 
 _KERNEL_COMPILER = CLANG
 ################################################################################
@@ -112,6 +213,12 @@ C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1
 C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2
 C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3
 
+# set group x/y values
+define VERSION-SET-ARGS
+kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-x $($(1)-grp-x)
+kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-y $($(1)-grp-y)
+endef
+$(foreach v,$(VERSIONS),$(eval $(call VERSION-SET-ARGS,$v)))
 
 ################################
 # Inner Product U-Benchmarking #
@@ -156,11 +263,61 @@ iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARG
 # Add to versions
 VERSIONS += $(IPRODUCT-UBMK-VERSIONS)
 
+#########################################
+# Parallel Inner Product U-Benchmarking #
+#########################################
+# number iproducts
+N-IPRODUCTS := 1500
+GRID-X := 1 2 4 8
+GRID-Y := 1 2 4
+
+IPRODUCT-PARALLEL-BASENAME := iproduct_ubmk-parallel_v1
+
+define IPRODUCT-UBMK-PARALLEL-RULE
+# creates run directory from template
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.cpp: kernel/$(IPRODUCT-PARALLEL-BASENAME)/kernel.cpp
+	mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+# adds arguments
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1)
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-x $(2)
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-y $(3)
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-x $($(IPRODUCT-PARALLEL-BASENAME)-grp-x)
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-y $($(IPRODUCT-PARALLEL-BASENAME)-grp-y)
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.riscv
+kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
+# adds to list of iproduct u-bmk
+IPRODUCT-UBMK-PARALLEL-VERSIONS += iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)
+endef
+
+# Expand rule for each inner product input
+$(foreach gy,$(GRID-Y),$(foreach gx,$(GRID-X),$(foreach nip,$(N-IPRODUCTS), $(eval $(call IPRODUCT-UBMK-PARALLEL-RULE,$(nip),$(gx),$(gy))))))
+
+.PHONY: create-iproduct-ubmk-parallel
+.PHONY: purge-iproduct-ubmk-parallel
+.PHONY: iproduct-ubmk-parallel-stats
+
+# create rule
+create-iproduct-ubmk-parallel: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/kernel.cpp)
+
+# purge rule
+purge-iproduct-ubmk-parallel:
+	rm -rf $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v)
+
+# collect stats for all
+iproduct-ubmk-parallel-stats: create-iproduct-ubmk-parallel
+iproduct-ubmk-parallel-stats: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
+
+# Add to versions
+VERSIONS += $(IPRODUCT-UBMK-PARALLEL-VERSIONS)
+
 ####################
 # Greedy Walk Runs #
 ####################
 GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490
-GREEDY-WALK-BASENAME := greedy_walk_v3
+GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490
+GREEDY-WALK-BASENAME := greedy_walk_v4
 define GREEDY-WALK-RULE
 # creates run directory from template
 kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp
@@ -168,8 +325,8 @@ kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.c
 	cp $$< $$@
 
 # adds arguments
-kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1)
-kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: C_ARGS += --queries $(1)
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv
 kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
 
 # adds to list of greedy walk versions
@@ -201,7 +358,11 @@ VERSIONS += $(GREEDY-WALK-VERSIONS)
 # Beam Search Runs #
 ####################
 BEAM-SEARCH-QUERIES := 2 188 229 355 427 472
-BEAM-SEARCH-BASENAME := beam_search_v5
+BEAM-SEARCH-QUERIES += 25  74  112 140 148 178
+BEAM-SEARCH-QUERIES += 214 244 278 302 331
+BEAM-SEARCH-QUERIES += 396 420 452 489 511
+
+BEAM-SEARCH-BASENAME := beam_search_v10
 
 define BEAM-SEARCH-RULE
 # creates run directory from template

From 020fdccb99cac330e88ffa75c8df707d6b216d47 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 16:11:43 -0700
Subject: [PATCH 04/22] [ipnsw] submodules

---
 .gitmodules                                           | 6 ++++++
 examples/sdh-eval-workloads/ipnsw/graph-tools         | 1 +
 examples/sdh-eval-workloads/ipnsw/hammerblade-helpers | 1 +
 examples/sdh-eval-workloads/ipnsw/hb-prog-eval        | 2 +-
 4 files changed, 9 insertions(+), 1 deletion(-)
 create mode 160000 examples/sdh-eval-workloads/ipnsw/graph-tools
 create mode 160000 examples/sdh-eval-workloads/ipnsw/hammerblade-helpers

diff --git a/.gitmodules b/.gitmodules
index 5083eb4ea..099c758cb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,9 @@
 [submodule "n"]
 	path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval
 	url = git@github.com:bespoke-silicon-group/hb-prog-eval
+[submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"]
+	path = examples/sdh-eval-workloads/ipnsw/graph-tools
+	url = git@github.com:mrutt92/graph-tools
+[submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"]
+	path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
+	url = git@github.com:mrutt92/hammerblade-helpers
diff --git a/examples/sdh-eval-workloads/ipnsw/graph-tools b/examples/sdh-eval-workloads/ipnsw/graph-tools
new file mode 160000
index 000000000..a7304c67c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/graph-tools
@@ -0,0 +1 @@
+Subproject commit a7304c67c34070877e57719fd183c4a5ee569904
diff --git a/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
new file mode 160000
index 000000000..9a26b6d0c
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
@@ -0,0 +1 @@
+Subproject commit 9a26b6d0cbe04a9cc627cce7049a0ba97ca66621
diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
index 5915cc2c4..f113c0865 160000
--- a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
+++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval
@@ -1 +1 @@
-Subproject commit 5915cc2c4bc6336102c452a4e7d0a7b06ccf9222
+Subproject commit f113c0865d2d9491551dab8f8b500445b75429bc

From 663f2089bea13bd027a205eb0934c1f05dc9aad4 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 19:20:26 -0700
Subject: [PATCH 05/22] [ipnsw] Finally ported and working

---
 examples/sdh-eval-workloads/ipnsw/Makefile  | 514 ++++----------------
 examples/sdh-eval-workloads/ipnsw/ipnsw.cpp |   2 +
 2 files changed, 109 insertions(+), 407 deletions(-)

diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index a8350c2cd..c53ea4368 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -1,466 +1,166 @@
-# Copyright (c) 2019, University of Washington All rights reserved.
-# 
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
-# 
-# Redistributions of source code must retain the above copyright notice, this list
-# of conditions and the following disclaimer.
-# 
-# Redistributions in binary form must reproduce the above copyright notice, this
-# list of conditions and the following disclaimer in the documentation and/or
-# other materials provided with the distribution.
-# 
-# Neither the name of the copyright holder nor the names of its contributors may
-# be used to endorse or promote products derived from this software without
-# specific prior written permission.
-# 
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-################################################################################
-# Paths / Environment Configuration
-################################################################################
-_REPO_ROOT ?= $(shell git rev-parse --show-toplevel)
-CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-
--include $(_REPO_ROOT)/environment.mk
-
-################################################################################
-# Define BSG_MACHINE_PATH, the location of the Makefile.machine.include file
-# that defines the machine to compile and simulate on. Using BSG_F1_DIR (which
-# is set in environment.mk) uses the same machine as in bsg_replicant.
-################################################################################
-
-BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm
+#####################
+# Standard includes #
+#####################
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+include $(REPLICANT_PATH)/environment.mk
+
+#######################################
+# Base class run directory generation #
+#######################################
+# $1 = name
+# $2 = version
+# $3 = args
+define run-dir
+run/$1/kernel.cpp: kernel/$2/kernel.cpp
+	@mkdir -p $$(dir $$@)
+	@cp $$< $$@
+	@echo "MAKING $$@"
+
+run/$1/Makefile: template.mk
+	@mkdir -p $$(dir $$@)
+	@cat $$< > $$@
+	@echo "C_ARGS += $3" >> $$@
+	@echo "MAKING $$@"
+
+.PHONY: generate-$1 build-$1 purge-$1 run-$1
+
+generate-$1: run/$1/Makefile run/$1/kernel.cpp
+purge-$1:
+	rm -rf run/$1
+build-$1: generate-$1
+	$(MAKE) -C run/$1 main.riscv
+run-$1: generate-$1
+	$(MAKE) -C run/$1 main.exec.log
+endef
 
-################################################################################
-# Define the range of versions
-################################################################################
+#################################
+# Common command line arguments #
+#################################
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/database_music100.bin
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/query_music100.bin
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_0
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_1
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_2
+C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_3
+
+###############
+# Greedy Walk #
+###############
+# greedy-walk version -> dimensions
  #  inner product with ipc=0.3 (8x4)
-VERSIONS := greedy_walk
 greedy_walk-grp-x := 1
 greedy_walk-grp-y := 1
 #  inner product with ipc=0.43 (8x4)
-VERSIONS += greedy_walk_v1
 greedy_walk_v1-grp-x := 1
 greedy_walk_v1-grp-y := 1
 #  inner product with FLOPS/cycle=0.2  (8x4)
-VERSIONS += greedy_walk_v2
 greedy_walk_v2-grp-x := 1
 greedy_walk_v2-grp-y := 1
 #  inner product with FLOPS/cycle=0.26 (8x4)
-VERSIONS += greedy_walk_v3
 greedy_walk_v3-grp-x := 1
 greedy_walk_v3-grp-y := 1
 #  inner product v4-serial
-VERSIONS += greedy_walk_v3-ipv4serial
 greedy_walk_v3-ipv4serial-grp-x := 1
 greedy_walk_v3-ipv4serial-grp-y := 1
 #  greedy_walk_v3 + ParallelInnerProduct_v1
-VERSIONS += greedy_walk_v4
 greedy_walk_v4-grp-x := 2
 greedy_walk_v4-grp-y := 2
 
+# $1 = version
+# $2 = query
+greedy-walk-name = $(1)_query$(2)
+define greedy-walk
+$(eval $(call run-dir,$(call greedy-walk-name,$1,$2),$1,\
+$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call greedy-walk-name,$1,$2)/kernel.riscv \
+$1 \
+$(C_ARGS) \
+--queries $(2) \
+--group-x $($(1)-grp-x) \
+--group-y $($(1)-grp-y) \
+))
+greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2)
+greedy-walk-purge:    purge-$(call greedy-walk-name,$1,$2)
+greedy-walk-build:    build-$(call greedy-walk-name,$1,$2)
+greedy-walk-run:      run-$(call greedy-walk-name,$1,$2)
+endef
+.PHONY: greedy-walk-generate
+.PHONY: greedy-walk-purge
+.PHONY: greedy-walk-build
+.PHONY: greedy-walk-run
+
+###############
+# Beam Search #
+###############
+# beam-search version -> dimensions
+
 #  very slow - uses a very dumb sparse set
-VERSIONS += beam_search
 beam_search-grp-x := 1
 beam_search-grp-y := 1
 #  dense set - inner product with ipc=0.3  (8x4)
-VERSIONS += beam_search_v1
 beam_search_v1-grp-x := 1
 beam_search_v1-grp-y := 1
 #  dense set - inner product with ipc=0.43 (8x4)
-VERSIONS += beam_search_v2
 beam_search_v2-grp-x := 1
 beam_search_v2-grp-y := 1
 #  + inner_product_v2 (flops/cycle=0.2039) (8x4)
-VERSIONS += beam_search_v3
 beam_search_v3-grp-x := 1
 beam_search_v3-grp-y := 1
 #  + inner_product_v3 (flops/cycle=0.2663) (8x4)
-VERSIONS += beam_search_v4
 beam_search_v4-grp-x := 1
 beam_search_v4-grp-y := 1
 #  + Bit vector for dense set
-VERSIONS += beam_search_v5
 beam_search_v5-grp-x := 1
 beam_search_v5-grp-y := 1
 #  + Bit vector for dense set + inner product v4 seria;
-VERSIONS += beam_search_v5-ipv4serial
 beam_search_v5-ipv4serial-grp-x := 1
 beam_search_v5-ipv4serial-grp-y := 1
 #  beam_search_v5 + inner_product_parallel_v3
-VERSIONS += beam_search_v6
 beam_search_v6-grp-x := 2
 beam_search_v6-grp-y := 2
 # beam_search_v6 but with 1x2 tile group
-VERSIONS += beam_search_v7
 beam_search_v7-grp-x := 1
 beam_search_v7-grp-y := 2
 # beam_search_v5 but edges of candidates traversed in parallel
-VERSIONS += beam_search_v8
 beam_search_v8-grp-x := 4
 beam_search_v8-grp-y := 4
 # combination of beam_search_v8 + beam_search_v6
-VERSIONS += beam_search_v9
 beam_search_v9-grp-x := 4
 beam_search_v9-grp-y := 4
 # beam_search_v5 but edges of candidates traversed in parallel
-VERSIONS += beam_search_v10
 beam_search_v10-grp-x := 8
 beam_search_v10-grp-y := 4
 
-# debugging this makefile
-VERSIONS += debug
-debug-grp-x := 0
-debug-grp-y := 0
-
- # baseline - ipc = 0.3
-VERSIONS += iproduct_ubmk
-iproduct_ubmk-grp-x := 1
-iproduct_ubmk-grp-y := 1
-# using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867
-VERSIONS += iproduct_ubmk_v1
-iproduct_ubmk_v1-grp-x := 1
-iproduct_ubmk_v1-grp-y := 1
-# + FMA, ipc = 0.386, flops/cycle = 0.2039
-VERSIONS += iproduct_ubmk_v2
-iproduct_ubmk_v2-grp-x := 1
-iproduct_ubmk_v2-grp-y := 1
-# + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4)
-VERSIONS += iproduct_ubmk_v3
-iproduct_ubmk_v3-grp-x := 1
-iproduct_ubmk_v3-grp-y := 1
-# Slightly cleaner code than v3 - similar performance
-VERSIONS += iproduct_ubmk_v4
-iproduct_ubmk_v4-grp-x := 1
-iproduct_ubmk_v4-grp-y := 1
- #...
-VERSIONS += iproduct_ubmk_parallel
-iproduct_ubmk_parallel-grp-x := 1
-iproduct_ubmk_parallel-grp-y := 1
- #...
-VERSIONS += iproduct_ubmk-parallel_v1
-iproduct_ubmk-parallel_v1-grp-x := 2
-iproduct_ubmk-parallel_v1-grp-y := 2
- #...
-VERSIONS += iproduct_ubmk-parallel_v1.1
-iproduct_ubmk-parallel_v1.1-grp-x := 2
-iproduct_ubmk-parallel_v1.1-grp-y := 2
- #...
-VERSIONS += iproduct_ubmk-parallel_v2
-iproduct_ubmk-parallel_v2-grp-x := 2
-iproduct_ubmk-parallel_v2-grp-y := 2
-#... same as v2 but with (1x4 tg)
-VERSIONS += iproduct_ubmk-parallel_v3
-iproduct_ubmk-parallel_v3-grp-x := 2
-iproduct_ubmk-parallel_v3-grp-y := 2
-
-_KERNEL_COMPILER = CLANG
-################################################################################
-# Define any sources that should be used compiled during kernel compilation,
-# including the source file with the kernel itself. kernel.riscv will
-# be the name of the compiled RISC-V Binary for the Manycore
-#
-# Use KERNEL_*LIBRARIES list sources that should be compiled and linked with all
-# kernel.cpp versions. However, if you have version-specific sources you must
-# come up with your own solution.
-# 
-# Use KERNEL_INCLUDES to specify the path to directories that contain headers.
-################################################################################
-
-# C Libraries
-KERNEL_CLIBRARIES   +=
-# C++ Libraries
-KERNEL_CXXLIBRARIES +=
-
-KERNEL_INCLUDES     += -I$(CURRENT_PATH)/kernel/include
-
-# Define the default kernel.cpp file. If KERNEL_DEFAULT is not defined it will
-# be set to kernel.cpp in the same directory as this Makefile.
-DEFAULT_VERSION     := greedy_walk_v3
-KERNEL_DEFAULT      := kernel/$(DEFAULT_VERSION)/kernel.cpp
-#KERNEL_DEFAULT      := kernel/$(DEFAULT_VERSION)/kernel.c
-
-################################################################################
-# Include the kernel build rules (This must be included after KERNEL_*LIBRARIES,
-# KERNEL_DEFAULT, KERNEL_INCLUDES, etc)
-################################################################################
-
--include $(EXAMPLES_PATH)/examples/cuda/riscv.mk
-
-################################################################################
-# END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES
-################################################################################
-
-################################################################################
-# Include the Cosimulation host build rules (This must be included after
-# HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc)
-################################################################################
-HOST_TARGET = ipnsw
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2
-C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3
-
-# set group x/y values
-define VERSION-SET-ARGS
-kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-x $($(1)-grp-x)
-kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-y $($(1)-grp-y)
-endef
-$(foreach v,$(VERSIONS),$(eval $(call VERSION-SET-ARGS,$v)))
-
-################################
-# Inner Product U-Benchmarking #
-################################
-# number iproducts
-N-IPRODUCTS := 150 500 1000 1500 2000 3000
-IPRODUCT-BASENAME := iproduct_ubmk_v4
-
-define IPRODUCT-UBMK-RULE
-# creates run directory from template
-kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp
-	mkdir -p $$(dir $$@)
-	cp $$< $$@
-
-# adds arguments
-kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1)
-kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv
-kernel/iproduct_ubmk-$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
-
-# adds to list of iproduct u-bmk
-IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1)
+# $1 = version
+# $2 = query
+beam-search-name = $(1)_query$(2)
+define beam-search
+$(eval $(call run-dir,$(call beam-search-name,$1,$2),$1,\
+$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call beam-search-name,$1,$2)/kernel.riscv \
+$1 \
+$(C_ARGS) \
+--queries $(2) \
+--group-x $($(1)-grp-x) \
+--group-y $($(1)-grp-y) \
+))
+beam-search-generate: generate-$(call beam-search-name,$1,$2)
+beam-search-purge:    purge-$(call beam-search-name,$1,$2)
+beam-search-build:    build-$(call beam-search-name,$1,$2)
+beam-search-run:      run-$(call beam-search-name,$1,$2)
 endef
-
-# Expand rule for each inner product input
-$(foreach nip,$(N-IPRODUCTS),$(eval $(call IPRODUCT-UBMK-RULE,$(nip))))
-
-.PHONY: create-iproduct-ubmk
-.PHONY: purge-iproduct-ubmk
-.PHONY: iproduct-ubmk-stats
-
-# create rule
-create-iproduct-ubmk: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/kernel.cpp)
-
-# purge rule
-purge-iproduct-ubmk:
-	rm -rf $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v)
-
-# collect stats for all
-iproduct-ubmk-stats: create-iproduct-ubmk
-iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
-
-# Add to versions
-VERSIONS += $(IPRODUCT-UBMK-VERSIONS)
-
-#########################################
-# Parallel Inner Product U-Benchmarking #
-#########################################
-# number iproducts
-N-IPRODUCTS := 1500
-GRID-X := 1 2 4 8
-GRID-Y := 1 2 4
-
-IPRODUCT-PARALLEL-BASENAME := iproduct_ubmk-parallel_v1
-
-define IPRODUCT-UBMK-PARALLEL-RULE
-# creates run directory from template
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.cpp: kernel/$(IPRODUCT-PARALLEL-BASENAME)/kernel.cpp
-	mkdir -p $$(dir $$@)
-	cp $$< $$@
-
-# adds arguments
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1)
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-x $(2)
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-y $(3)
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-x $($(IPRODUCT-PARALLEL-BASENAME)-grp-x)
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-y $($(IPRODUCT-PARALLEL-BASENAME)-grp-y)
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.riscv
-kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
-# adds to list of iproduct u-bmk
-IPRODUCT-UBMK-PARALLEL-VERSIONS += iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)
-endef
-
-# Expand rule for each inner product input
-$(foreach gy,$(GRID-Y),$(foreach gx,$(GRID-X),$(foreach nip,$(N-IPRODUCTS), $(eval $(call IPRODUCT-UBMK-PARALLEL-RULE,$(nip),$(gx),$(gy))))))
-
-.PHONY: create-iproduct-ubmk-parallel
-.PHONY: purge-iproduct-ubmk-parallel
-.PHONY: iproduct-ubmk-parallel-stats
-
-# create rule
-create-iproduct-ubmk-parallel: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/kernel.cpp)
-
-# purge rule
-purge-iproduct-ubmk-parallel:
-	rm -rf $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v)
-
-# collect stats for all
-iproduct-ubmk-parallel-stats: create-iproduct-ubmk-parallel
-iproduct-ubmk-parallel-stats: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
-
-# Add to versions
-VERSIONS += $(IPRODUCT-UBMK-PARALLEL-VERSIONS)
-
-####################
-# Greedy Walk Runs #
-####################
-GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490
-GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490
-GREEDY-WALK-BASENAME := greedy_walk_v4
-define GREEDY-WALK-RULE
-# creates run directory from template
-kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp
-	mkdir -p $$(dir $$@)
-	cp $$< $$@
-
-# adds arguments
-kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: C_ARGS += --queries $(1)
-kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv
-kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
-
-# adds to list of greedy walk versions
-GREEDY-WALK-VERSIONS += greedy_walk-query$(1)
-endef
-
-# Expand rule for each query
-$(foreach q,$(GREEDY-WALK-QUERIES),$(eval $(call GREEDY-WALK-RULE,$(q))))
-
-.PHONY: create-greedy-walk
-.PHONY: purge-greedy-walk
-.PHONY: greedy-walk-stats
-
-# create rule
-create-greedy-walk: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/kernel.cpp)
-
-# purge rule
-purge-greedy-walk:
-	rm -rf $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v)
-
-# collect stats for all
-greedy-walk-stats: create-greedy-walk
-greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
-
-# Add to versions
-VERSIONS += $(GREEDY-WALK-VERSIONS)
-
-####################
-# Beam Search Runs #
-####################
-BEAM-SEARCH-QUERIES := 2 188 229 355 427 472
-BEAM-SEARCH-QUERIES += 25  74  112 140 148 178
-BEAM-SEARCH-QUERIES += 214 244 278 302 331
-BEAM-SEARCH-QUERIES += 396 420 452 489 511
-
-BEAM-SEARCH-BASENAME := beam_search_v10
-
-define BEAM-SEARCH-RULE
-# creates run directory from template
-kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.cpp
-	mkdir -p $$(dir $$@)
-	cp $$< $$@
-
-# adds arguments
-kernel/beam_search-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1)
-kernel/beam_search-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/beam_search-query$(1)/kernel.riscv
-kernel/beam_search-query$$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
-
-# adds to list of greedy walk versions
-BEAM-SEARCH-VERSIONS += beam_search-query$(1)
-endef
-
-
-# Expand rule for each query
-$(foreach q,$(BEAM-SEARCH-QUERIES),$(eval $(call BEAM-SEARCH-RULE,$(q))))
-
-.PHONY: create-beam-search
-.PHONY: purge-beam-search
-.PHONY: beam-search-stats
-
-# create rule
-create-beam-search: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/kernel.cpp)
-
-# purge rule
-purge-beam-search:
-	rm -rf $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v)
-
-# collect stats for all
-beam-search-stats: create-beam-search
-beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log)
-
-# Add to versions
-VERSIONS += $(BEAM-SEARCH-VERSIONS)
-
-########################################
-# Continue including cosim build rules #
-########################################
-
-GRAPH-TOOLS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools
-graphtools-dir := $(GRAPH-TOOLS)
-
-include $(GRAPH-TOOLS)/libgraphtools.mk
-
-HB-HELPERS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers
-hammerblade-helpers-dir := $(HB-HELPERS)
-include $(HB-HELPERS)/libhammerblade-helpers-host.mk
-
-CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags)
-CXXFLAGS += $(libgraphtools-interface-cxxflags)
-CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
-CXXFLAGS += -DCOSIM
-
-LDFLAGS  += $(libhammerblade-helpers-host-interface-ldflags)
-LDFLAGS  += $(libgraphtools-interface-ldflags)
-
-GreedyWalkResults.o: $(libhammerblade-helpers-host-interface-headers)
-GreedyWalkResults.o: $(libgraphtools-interface-headers)
-GreedyWalkResults.o: $(libgraphtools-interface-libraries)
-GreedyWalkResults.o: GreedyWalkResults.cpp
-GreedyWalkResults.o: GreedyWalkResults.hpp
-
-ipnsw.o: $(libhammerblade-helpers-host-interface-headers)
-ipnsw.o: $(libgraphtools-interface-headers)
-ipnsw.o: $(libgraphtools-interface-libraries)
-ipnsw.o: IO.hpp
-ipnsw.o: IPNSWGraph.hpp
-ipnsw.o: IPNSWRunner.hpp
-ipnsw.o: IPNSWKernelRunner.hpp
-ipnsw.o: GreedyWalkKernelRunner.hpp
-ipnsw.o: BeamSearchKernelRunner.hpp
-ipnsw.o: IProductUBmkKernelRunner.hpp
-ipnsw.o: IPNSWResultReader.hpp
-ipnsw.o: GreedyWalkResultReader.hpp
-ipnsw.o: BeamSearchResultReader.hpp
-ipnsw.o: GreedyWalkResults.hpp
-ipnsw.o: IPNSWFactory.hpp
-ipnsw.o: GreedyWalkFactory.hpp
-ipnsw.o: BeamSearchFactory.hpp
-ipnsw.o: IProductUBmkFactory.hpp
-ipnsw.o: StringHelpers.hpp
-
-TEST_SOURCES = ipnsw.cpp GreedyWalkResults.cpp
-
--include $(EXAMPLES_PATH)/compilation.mk
--include $(EXAMPLES_PATH)/link.mk
--include $(EXAMPLES_PATH)/execution.mk
-
-################################################################################
-# Define the clean rules. clean calls the makefile-specific cleans, whereas
-# users can add commands and dependencies to custom.clean.
-################################################################################
-version.clean:
-	rm -rf kernel/*/*{.csv,.log,.rvo,.riscv,.vpd,.key,.png,.dis}
-	rm -rf kernel/*/{stats,pc_stats}
-
-custom.clean: version.clean
+.PHONY: beam-search-generate
+.PHONY: beam-search-purge
+.PHONY: beam-search-build
+.PHONY: beam-search-run
+
+#############################################################
+# Define which queries we want to run and instantiate rules #
+#############################################################
+greedy-walk-queries := 4 16 229 276 461 470 490
+$(foreach q,$(greedy-walk-queries),$(eval $(call greedy-walk,greedy_walk_v4,$(q))))
+
+beam-search-queries := 2   188 229 355 427 472
+beam-search-queries += 25  74  112 140 148 178
+beam-search-queries += 214 244 278 302 331
+beam-search-queries += 396 420 452 489 511
+$(foreach q,$(beam-search-queries),$(eval $(call beam-search,beam_search_v10,$(q))))
 
diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
index 8de8e073e..23f2b20d1 100644
--- a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
+++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp
@@ -28,6 +28,8 @@ int Main(int argc, char *argv[])
     Parser args;
     args.parse(argc, argv);
 
+    std::cout << args.str() << std::endl;
+
     std::unique_ptr<IPNSWRunner> runner;
     std::unique_ptr<IPNSWFactory> factory;
 

From 79aad5fbc501efeef62a09fd2909880e3bec4df9 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 19:22:38 -0700
Subject: [PATCH 06/22] [ipnsw] adds a profile rule

---
 examples/sdh-eval-workloads/ipnsw/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index c53ea4368..6d2c4ba49 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -22,7 +22,7 @@ run/$1/Makefile: template.mk
 	@echo "C_ARGS += $3" >> $$@
 	@echo "MAKING $$@"
 
-.PHONY: generate-$1 build-$1 purge-$1 run-$1
+.PHONY: generate-$1 build-$1 purge-$1 run-$1 profile-$1
 
 generate-$1: run/$1/Makefile run/$1/kernel.cpp
 purge-$1:
@@ -31,6 +31,8 @@ build-$1: generate-$1
 	$(MAKE) -C run/$1 main.riscv
 run-$1: generate-$1
 	$(MAKE) -C run/$1 main.exec.log
+profile-$1: generate-$1
+	$(MAKE) -C run/$1 main.profile.log
 endef
 
 #################################
@@ -82,11 +84,13 @@ greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2)
 greedy-walk-purge:    purge-$(call greedy-walk-name,$1,$2)
 greedy-walk-build:    build-$(call greedy-walk-name,$1,$2)
 greedy-walk-run:      run-$(call greedy-walk-name,$1,$2)
+greedy-walk-profile:  profile-$(call greedy-walk-name,$1,$2)
 endef
 .PHONY: greedy-walk-generate
 .PHONY: greedy-walk-purge
 .PHONY: greedy-walk-build
 .PHONY: greedy-walk-run
+.PHONY: greedy-walk-profile
 
 ###############
 # Beam Search #
@@ -146,11 +150,13 @@ beam-search-generate: generate-$(call beam-search-name,$1,$2)
 beam-search-purge:    purge-$(call beam-search-name,$1,$2)
 beam-search-build:    build-$(call beam-search-name,$1,$2)
 beam-search-run:      run-$(call beam-search-name,$1,$2)
+beam-search-profile:  profile-$(call beam-search-name,$1,$2)
 endef
 .PHONY: beam-search-generate
 .PHONY: beam-search-purge
 .PHONY: beam-search-build
 .PHONY: beam-search-run
+.PHONY: beam-search-profile
 
 #############################################################
 # Define which queries we want to run and instantiate rules #

From de5dec8dc29e989664b40dcd2be8714a508324d8 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Fri, 30 Apr 2021 19:22:54 -0700
Subject: [PATCH 07/22] [ipnsw] adds missing template makefile

---
 examples/sdh-eval-workloads/ipnsw/template.mk | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 examples/sdh-eval-workloads/ipnsw/template.mk

diff --git a/examples/sdh-eval-workloads/ipnsw/template.mk b/examples/sdh-eval-workloads/ipnsw/template.mk
new file mode 100644
index 000000000..13c1e5919
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/template.mk
@@ -0,0 +1,72 @@
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+include $(REPLICANT_PATH)/environment.mk
+include $(BSG_MACHINE_PATH)/Makefile.machine.include
+
+# kernel code
+BSG_MANYCORE_KERNELS = kernel.riscv
+
+RISCV_CCPPFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/kernel/include
+RISCV_CCPPFLAGS += -Dbsg_tiles_X=1
+RISCV_CCPPFLAGS += -Dbsg_tiles_Y=1
+
+RISCV_TARGET_OBJECTS = kernel.rvo
+kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
+RISCV_OPT_LEVEL = -O3
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+RISCV_LDFLAGS := $(filter-out -nostdlib,$(RISCV_LDFLAGS))
+
+# host code
+graphtools-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools
+hammerblade-helpers-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers
+
+include $(graphtools-dir)/libgraphtools.mk
+include $(hammerblade-helpers-dir)/libhammerblade-helpers-host.mk
+
+# header files
+TEST_HEADERS := $(libhammerblade-helpers-host-interface-headers)
+TEST_HEADERS += $(libgraphtools-interface-headers)
+TEST_HEADERS += GreedyWalkResults.hpp
+TEST_HEADERS += IO.hpp
+TEST_HEADERS += IPNSWGraph.hpp
+TEST_HEADERS += IPNSWRunner.hpp
+TEST_HEADERS += IPNSWKernelRunner.hpp
+TEST_HEADERS += GreedyWalkKernelRunner.hpp
+TEST_HEADERS += BeamSearchKernelRunner.hpp
+TEST_HEADERS += IProductUBmkKernelRunner.hpp
+TEST_HEADERS += IPNSWResultReader.hpp
+TEST_HEADERS += GreedyWalkResultReader.hpp
+TEST_HEADERS += BeamSearchResultReader.hpp
+TEST_HEADERS += GreedyWalkResults.hpp
+TEST_HEADERS += IPNSWFactory.hpp
+TEST_HEADERS += GreedyWalkFactory.hpp
+TEST_HEADERS += BeamSearchFactory.hpp
+TEST_HEADERS += IProductUBmkFactory.hpp
+TEST_HEADERS += StringHelpers.hpp
+
+# source files
+TEST_SOURCES := GreedyWalkResults.cpp
+TEST_SOURCES += ipnsw.cpp
+
+# cxxflags
+CXXFLAGS += $(libgraphtools-interface-cxxflags)
+CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags)
+CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+CXXFLAGS += -DCOSIM
+
+# ldflags
+LDFLAGS += $(libgraphtools-interface-ldflags)
+LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags)
+
+vpath %.cpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+vpath %.hpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw
+
+TEST_NAME = main
+
+include $(EXAMPLES_PATH)/compilation.mk
+include $(EXAMPLES_PATH)/link.mk
+
+# mark dependencies
+$(TEST_OBJECTS): $(libgraphtools-interface-libraries)
+$(TEST_OBJECTS): $(TEST_HEADERS)
+
+include $(EXAMPLES_PATH)/execution.mk

From 7b25a5944edaa439b3a098b89832f733d2586d9e Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Sun, 2 May 2021 09:24:33 -0700
Subject: [PATCH 08/22] [ipnsw] cleans up the directory a bit

---
 examples/sdh-eval-workloads/ipnsw/.gitignore |  1 +
 examples/sdh-eval-workloads/ipnsw/Makefile   | 82 +++++++++-----------
 2 files changed, 39 insertions(+), 44 deletions(-)
 create mode 100644 examples/sdh-eval-workloads/ipnsw/.gitignore

diff --git a/examples/sdh-eval-workloads/ipnsw/.gitignore b/examples/sdh-eval-workloads/ipnsw/.gitignore
new file mode 100644
index 000000000..737e26b00
--- /dev/null
+++ b/examples/sdh-eval-workloads/ipnsw/.gitignore
@@ -0,0 +1 @@
+run/
\ No newline at end of file
diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index 6d2c4ba49..d643c9e9f 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -4,6 +4,8 @@
 REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
 include $(REPLICANT_PATH)/environment.mk
 
+all:
+
 #######################################
 # Base clase run directory generation #
 #######################################
@@ -25,14 +27,24 @@ run/$1/Makefile: template.mk
 .PHONY: generate-$1 build-$1 purge-$1 run-$1 profile-$1
 
 generate-$1: run/$1/Makefile run/$1/kernel.cpp
+
 purge-$1:
 	rm -rf run/$1
+
 build-$1: generate-$1
-	$(MAKE) -C run/$1 main.riscv
-run-$1: generate-$1
-	$(MAKE) -C run/$1 main.exec.log
+	+$(MAKE) -C run/$1 main.riscv
+
+exec-$1: generate-$1
+	+$(MAKE) -C run/$1 main.exec.log
+
 profile-$1: generate-$1
-	$(MAKE) -C run/$1 main.profile.log
+	+$(MAKE) -C run/$1 main.profile.log
+
+debug-$1: generate-$1
+	+$(MAKE) -C run/$1 main.debug.log
+# saifgen-<run> is what the aggregate saifgen target depends on; saif-<run> is kept as an alias
+saif-$1 saifgen-$1: generate-$1
+	+$(MAKE) -C run/$1 main.saifgen.log
 endef
 
 #################################
@@ -68,30 +80,6 @@ greedy_walk_v3-ipv4serial-grp-y := 1
 greedy_walk_v4-grp-x := 2
 greedy_walk_v4-grp-y := 2
 
-# $1 = version
-# $2 = query
-greedy-walk-name = $(1)_query$(2)
-define greedy-walk
-$(eval $(call run-dir,$(call greedy-walk-name,$1,$2),$1,\
-$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call greedy-walk-name,$1,$2)/kernel.riscv \
-$1 \
-$(C_ARGS) \
---queries $(2) \
---group-x $($(1)-grp-x) \
---group-y $($(1)-grp-y) \
-))
-greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2)
-greedy-walk-purge:    purge-$(call greedy-walk-name,$1,$2)
-greedy-walk-build:    build-$(call greedy-walk-name,$1,$2)
-greedy-walk-run:      run-$(call greedy-walk-name,$1,$2)
-greedy-walk-profile:  profile-$(call greedy-walk-name,$1,$2)
-endef
-.PHONY: greedy-walk-generate
-.PHONY: greedy-walk-purge
-.PHONY: greedy-walk-build
-.PHONY: greedy-walk-run
-.PHONY: greedy-walk-profile
-
 ###############
 # Beam Search #
 ###############
@@ -136,37 +124,43 @@ beam_search_v10-grp-y := 4
 
 # $1 = version
 # $2 = query
-beam-search-name = $(1)_query$(2)
-define beam-search
-$(eval $(call run-dir,$(call beam-search-name,$1,$2),$1,\
-$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call beam-search-name,$1,$2)/kernel.riscv \
+run-name = $(1)_query$(2)
+define run
+$(eval $(call run-dir,$(call run-name,$1,$2),$1,\
+$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call run-name,$1,$2)/kernel.riscv \
 $1 \
 $(C_ARGS) \
 --queries $(2) \
 --group-x $($(1)-grp-x) \
 --group-y $($(1)-grp-y) \
 ))
-beam-search-generate: generate-$(call beam-search-name,$1,$2)
-beam-search-purge:    purge-$(call beam-search-name,$1,$2)
-beam-search-build:    build-$(call beam-search-name,$1,$2)
-beam-search-run:      run-$(call beam-search-name,$1,$2)
-beam-search-profile:  profile-$(call beam-search-name,$1,$2)
+generate: generate-$(call run-name,$1,$2)
+purge:    purge-$(call run-name,$1,$2)
+build:    build-$(call run-name,$1,$2)
+exec:     exec-$(call run-name,$1,$2)
+profile:  profile-$(call run-name,$1,$2)
+debug:    debug-$(call run-name,$1,$2)
+saifgen:  saifgen-$(call run-name,$1,$2)
 endef
-.PHONY: beam-search-generate
-.PHONY: beam-search-purge
-.PHONY: beam-search-build
-.PHONY: beam-search-run
-.PHONY: beam-search-profile
+.PHONY: generate
+.PHONY: purge
+.PHONY: build
+.PHONY: exec
+.PHONY: profile
+.PHONY: debug
+.PHONY: saifgen
 
 #############################################################
 # Define which queries we want to run and instantiate rules #
 #############################################################
 greedy-walk-queries := 4 16 229 276 461 470 490
-$(foreach q,$(greedy-walk-queries),$(eval $(call greedy-walk,greedy_walk_v4,$(q))))
+$(foreach q,$(greedy-walk-queries),$(eval $(call run,greedy_walk_v4,$(q))))
 
 beam-search-queries := 2   188 229 355 427 472
 beam-search-queries += 25  74  112 140 148 178
 beam-search-queries += 214 244 278 302 331
 beam-search-queries += 396 420 452 489 511
-$(foreach q,$(beam-search-queries),$(eval $(call beam-search,beam_search_v10,$(q))))
+$(foreach q,$(beam-search-queries),$(eval $(call run,beam_search_v10,$(q))))
 
+.PHONY: all
+all: exec

From a36d3b03acc507dbd613a348b4196683cb741d14 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 22 Mar 2021 10:01:46 -0700
Subject: [PATCH 09/22] starting a graphit test dir, adding
 test_vec_add_parallel as dummy test for now

---
 examples/Makefile                             |   2 +-
 examples/graphit/Makefile                     |  61 +++++
 examples/graphit/riscv.mk                     | 257 ++++++++++++++++++
 .../graphit/test_vec_add_parallel/Makefile    | 142 ++++++++++
 .../graphit/test_vec_add_parallel/kernel.cpp  |  20 ++
 examples/graphit/test_vec_add_parallel/main.c | 196 +++++++++++++
 6 files changed, 677 insertions(+), 1 deletion(-)
 create mode 100644 examples/graphit/Makefile
 create mode 100644 examples/graphit/riscv.mk
 create mode 100644 examples/graphit/test_vec_add_parallel/Makefile
 create mode 100644 examples/graphit/test_vec_add_parallel/kernel.cpp
 create mode 100644 examples/graphit/test_vec_add_parallel/main.c

diff --git a/examples/Makefile b/examples/Makefile
index 1bc3055e8..6fda8a5ef 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -45,7 +45,7 @@ include $(REPLICANT_PATH)/environment.mk
 include $(EXAMPLES_PATH)/link.mk
 
 # Supported example suites
-TARGETS = library spmd cuda python
+TARGETS = library spmd cuda python graphit
 
 # Define the tests that get run
 TESTS += test_loader
diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile
new file mode 100644
index 000000000..1ac9533b9
--- /dev/null
+++ b/examples/graphit/Makefile
@@ -0,0 +1,61 @@
+# Copyright (c) 2019, University of Washington All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+# 
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+# 
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+# CL_DIR: Path to the directory of this AWS F1 Project
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+
+# Defines REGRESSION_PREBUILD
+include $(EXAMPLES_PATH)/link.mk
+
+# Define the tests that get run
+TESTS += test_vec_add_parallel
+
+regression: $(TESTS)
+	@echo "GRAPHIT REGRESSION PASSED"
+
+$(TESTS): $(REGRESSION_PREBUILD)
+	$(MAKE) -C $@ regression
+
+clean: $(TESTS:=.clean)
+
+%.clean:
+	$(MAKE) -C $(@:.clean=) clean
+
+.PHONY: clean regression $(TESTS) %.clean
diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk
new file mode 100644
index 000000000..1266a5e7f
--- /dev/null
+++ b/examples/graphit/riscv.mk
@@ -0,0 +1,257 @@
+# Copyright (c) 2019, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# RISC-V compile and link rules for manycore kernels; ORANGE/RED/NC are ANSI color escape codes.
+ORANGE=\033[0;33m
+RED=\033[0;31m
+NC=\033[0m
+
+################################################################################
+# Paths (the leading '-' on the include below tolerates a missing environment.mk)
+################################################################################
+_REPO_ROOT ?= $(shell git rev-parse --show-toplevel)
+-include $(_REPO_ROOT)/environment.mk
+# Source locations inside the BSG Manycore clone (BSG_MANYCORE_DIR).
+BSG_MANYCORE_SPMD_PATH = $(BSG_MANYCORE_DIR)/software/spmd/
+BSG_MANYCORE_CUDALITE_PATH = $(BSG_MANYCORE_SPMD_PATH)/bsg_cuda_lite_runtime/
+BSG_MANYCORE_CUDALITE_MAIN_PATH = $(BSG_MANYCORE_CUDALITE_PATH)/main
+
+BSG_MANYCORE_LIB_PATH    = $(BSG_MANYCORE_DIR)/software/bsg_manycore_lib
+BSG_MANYCORE_COMMON_PATH = $(BSG_MANYCORE_SPMD_PATH)/common/
+# GNU and LLVM toolchain install prefixes under the BSG Manycore riscv-tools directory.
+RISCV_TOOLS_PATH := $(BSG_MANYCORE_DIR)/software/riscv-tools/
+RISCV_GNU_PATH   := $(RISCV_TOOLS_PATH)/riscv-install
+RISCV_LLVM_PATH  := $(RISCV_TOOLS_PATH)/llvm/llvm-install
+
+################################################################################
+# RISC-V Tool Configuration
+################################################################################
+
+RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py
+
+# These flags are not supported by clang
+RISCV_GNU_FLAGS = -mno-fdiv -frerun-cse-after-loop -fweb -frename-registers
+# GNU tool entry points; ?= leaves any value that was already defined untouched.
+RISCV_GCC        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS)
+RISCV_GXX        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS)
+RISCV_ELF2HEX    ?= LD_LIBRARY_PATH=$(RISCV_GNU_PATH)/lib $(RISCV_GNU_PATH)/bin/elf2hex
+RISCV_OBJCOPY    ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objcopy
+RISCV_AR         ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-ar
+RISCV_OBJDUMP    ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objdump
+RISCV_LINK       ?= $(RISCV_GCC) -t -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS)
+RISCV_LD         ?= $(RISCV_GCC)
+# Clang targets riscv32/ilp32f and reuses the GNU install for its sysroot and libstdc++ 9.2.0 headers.
+RISCV_CLANG_ABI        = ilp32f
+RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI)
+RISCV_CLANG_CXXFLAGS  += --sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs
+RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0
+RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs
+
+RISCV_CLANG       ?= $(RISCV_LLVM_PATH)/bin/clang $(RISCV_CLANG_CFLAGS) $(RISCV_CLANG_CCPPFLAGS)
+RISCV_CLANGXX     ?= $(RISCV_LLVM_PATH)/bin/clang++ $(RISCV_CLANG_CXXFLAGS) $(RISCV_CLANG_CCPPFLAGS)
+RISCV_LLVM_OPT    ?= $(RISCV_LLVM_PATH)/bin/opt
+RISCV_LLVM_LLC    ?= $(RISCV_LLVM_PATH)/bin/llc
+RISCV_LLVM_LIB    ?= $(RISCV_LLVM_PATH)/lib
+
+# Set the default RISC-V Compilers. To override these globally set
+# RISCV_CXX = $(RISCV_CLANGXX), etc. This can also be done on a
+# per-object basis. For example, foo.rvo: RISCV_CXX=$(RISCV_CLANGXX)
+RISCV_CXX ?= $(RISCV_GXX)
+RISCV_CC  ?= $(RISCV_GCC)
+
+################################################################################
+# C/C++ Compilation Flags
+#
+# All RISC-V C/C++ compilation variables are the usual names prefixed with RISCV_.
+################################################################################
+RISCV_OPT_LEVEL   ?= -O2
+RISCV_ARCH_OP     := rv32imaf
+
+# CCPPFLAGS are common between GCC and G++
+RISCV_CCPPFLAGS += $(RISCV_OPT_LEVEL)
+RISCV_CCPPFLAGS += -march=$(RISCV_ARCH_OP)
+RISCV_CCPPFLAGS += -g
+RISCV_CCPPFLAGS += -static
+RISCV_CCPPFLAGS += -ffast-math
+RISCV_CCPPFLAGS += -fno-common
+RISCV_CCPPFLAGS += -ffp-contract=off
+
+RISCV_CFLAGS   += -std=gnu99 $(RISCV_CCPPFLAGS)
+RISCV_CXXFLAGS += -std=c++11 $(RISCV_CCPPFLAGS)
+RISCV_CXXFLAGS += -fno-threadsafe-statics
+
+RISCV_INCLUDES += -I$(BSG_MANYCORE_COMMON_PATH)
+RISCV_INCLUDES += -I$(BSG_MANYCORE_DIR)/software/bsg_manycore_lib
+# Machine geometry and host coordinates are forwarded to the kernel as preprocessor defines.
+# TODO: Fail if bsg_tiles_X/Y are not set (not defined here; the including Makefile is expected to add them)
+RISCV_DEFINES += -Dbsg_global_X=$(BSG_MACHINE_GLOBAL_X)
+RISCV_DEFINES += -Dbsg_global_Y=$(BSG_MACHINE_GLOBAL_Y)
+RISCV_DEFINES += -Dbsg_group_size=$(BSG_MACHINE_POD_TILES)
+RISCV_DEFINES += -Dbsg_pods_X=$(BSG_MACHINE_PODS_X)
+RISCV_DEFINES += -Dbsg_pods_Y=$(BSG_MACHINE_PODS_Y)
+RISCV_DEFINES += -DIO_X_INDEX=$(BSG_MACHINE_HOST_X_CORD)
+RISCV_DEFINES += -DIO_Y_INDEX=$(BSG_MACHINE_HOST_Y_CORD)
+RISCV_DEFINES += -DPREALLOCATE=0
+RISCV_DEFINES += -DHOST_DEBUG=0
+
+# We build a machine-specific crt.rvo locally: a hang caused by linking with the
+# wrong link script is REALLY difficult to debug. The log name is derived from $@
+# because $* is empty in an explicit rule whose target has no recognized suffix.
+crt.rvo: $(BSG_MANYCORE_COMMON_PATH)/crt.S
+	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $(basename $@).comp.log
+
+# We compile these locally so that we don't interfere with the files in
+# $(BSG_MANYCORE_LIB_PATH).
+# BSG Manycore Library Objects
+LIBBSG_MANYCORE_OBJECTS  += bsg_set_tile_x_y.rvo
+LIBBSG_MANYCORE_OBJECTS  += bsg_tile_config_vars.rvo
+LIBBSG_MANYCORE_OBJECTS  += bsg_printf.rvo
+
+$(LIBBSG_MANYCORE_OBJECTS) main.rvo: RISCV_CXX = $(RISCV_GCC)
+
+$(LIBBSG_MANYCORE_OBJECTS): %.rvo:$(BSG_MANYCORE_LIB_PATH)/%.c
+	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@
+
+main.rvo: $(BSG_MANYCORE_CUDALITE_MAIN_PATH)/main.c
+	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@
+# NOTE(review): '|&' is a bash extension and the pipeline reports tee's exit status; confirm SHELL/pipefail are set up for this.
+%.rvo: %.c
+	$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log
+
+%.rvo: %.cpp
+	$(RISCV_CXX) $(RISCV_CXXFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log
+
+kernel.compile.clean:
+	rm -rf *.rvo *.a
+
+.PRECIOUS: %.rvo
+
+################################################################################
+# Linker Flow
+################################################################################
+
+# ELF File Parameters
+# Default .data section location; LOCAL=>DMEM, SHARED=>DRAM.
+BSG_ELF_DEFAULT_DATA_LOC ?= LOCAL
+
+BSG_ELF_OFF_CHIP_MEM := $(BSG_MACHINE_DRAM_INCLUDED)
+
+# Total addressable DRAM size (in 32-bit WORDS, and SIZE bytes)
+BSG_ELF_DRAM_WORDS := $(shell expr $(BSG_MACHINE_DRAM_BANK_SIZE_WORDS) \* $(BSG_MACHINE_GLOBAL_X))
+BSG_ELF_DRAM_SIZE := $(shell expr $(BSG_ELF_DRAM_WORDS) \* 4)
+
+# Victim Cache Set Size (in 32-bit WORDS and SIZE bytes)
+_BSG_ELF_VCACHE_SET_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_WAY) \* $(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS))
+BSG_ELF_VCACHE_SET_SIZE := $(shell expr $(_BSG_ELF_VCACHE_SET_WORDS) \* 4)
+
+# Victim Cache Column Size (in 32-bit WORDS and SIZE bytes)
+_BSG_ELF_VCACHE_COLUMN_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_SET) \* $(_BSG_ELF_VCACHE_SET_WORDS))
+BSG_ELF_VCACHE_COLUMN_SIZE := $(shell expr $(_BSG_ELF_VCACHE_COLUMN_WORDS) \* 4)
+
+# Victim Cache Total Size (in 32-bit WORDS, and SIZE BYTES)
+_BSG_ELF_VCACHE_MANYCORE_WORDS ?= $(shell expr $(BSG_MACHINE_GLOBAL_X) \* $(_BSG_ELF_VCACHE_COLUMN_WORDS))
+BSG_ELF_VCACHE_MANYCORE_SIZE := $(shell expr $(_BSG_ELF_VCACHE_MANYCORE_WORDS) \* 4)
+
+# Compute the ELF Stack Pointer Location.
+ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
+# If the .data segment is in DMEM (LOCAL) then put the stack at the top of DMEM. (This is the typical case)
+BSG_ELF_STACK_PTR ?= 0x00000ffc
+else
+  # EVA Offset in DRAM
+  BSG_ELF_DRAM_EVA_OFFSET = 0x80000000
+  # expr(1) only accepts decimal integers, so the hex EVA offset is added with shell arithmetic instead.
+  ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1)
+  # Otherwise, use the top of DRAM (if present),
+  _BSG_ELF_DRAM_LIMIT = $(shell echo $$(($(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_DRAM_SIZE))))
+  else
+  # Or the Victim Cache address space (if DRAM is disabled/not present).
+  _BSG_ELF_DRAM_LIMIT = $(shell echo $$(($(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_VCACHE_MANYCORE_SIZE))))
+  endif
+# Finally, subtract 4 from the maximum memory space address (printed as hex, like the LOCAL default)
+BSG_ELF_STACK_PTR = $(shell printf '0x%08x' $$(($(_BSG_ELF_DRAM_LIMIT) - 4)))
+endif
+
+# Linker script generation parameters. Without off-chip DRAM, --dram_size is the total victim-cache size.
+ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1)
+  ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
+    LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR)
+  else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED)
+    LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR)
+  else
+    $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid)
+  endif
+
+  LINK_GEN_OPTS += --imem_size=0x01000000 # 16MB
+else ifeq ($(BSG_ELF_OFF_CHIP_MEM), 0)
+  ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
+    LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) --sp=$(BSG_ELF_STACK_PTR)
+  else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED)
+    LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) --sp=$(BSG_ELF_STACK_PTR)
+  else
+    $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid)
+  endif
+
+  LINK_GEN_OPTS += --imem_size=0x00008000 # 32KB
+else
+  $(error Invalid BSG_ELF_OFF_CHIP_MEM = $(BSG_ELF_OFF_CHIP_MEM); Only 0 and 1 are valid)
+endif
+
+RISCV_LINK_SCRIPT ?= bsg_link.ld
+$(RISCV_LINK_SCRIPT): $(RISCV_LINK_GEN)
+	$(RISCV_LINK_GEN) $(LINK_GEN_OPTS) --out=$@
+
+# Link commands and definitions
+# The ELF sizes and the stack pointer computed above are passed to the linker as symbols via --defsym.
+RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_dram_size=$(BSG_ELF_DRAM_SIZE)
+RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_vcache_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE)
+RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_stack_ptr=$(BSG_ELF_STACK_PTR)
+# Default startup files and libraries are disabled; libc, libm and libgcc are named explicitly.
+RISCV_LDFLAGS += -nostdlib
+RISCV_LDFLAGS += -march=$(RISCV_ARCH_OP)
+RISCV_LDFLAGS += -nostartfiles
+RISCV_LDFLAGS += -ffast-math
+RISCV_LDFLAGS += -lc
+RISCV_LDFLAGS += -lm
+RISCV_LDFLAGS += -lgcc
+
+# TODO: temporary fix to solve this problem: https://stackoverflow.com/questions/56518056/risc-v-linker-throwing-sections-lma-overlap-error-despite-lmas-belonging-to-dif
+RISCV_LDFLAGS += -Wl,--no-check-sections 
+
+# This builds a .riscv binary for the current machine type and tile
+# group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked
+# in the final binary; $(filter %.rvo,$^) keeps the link script out of the object list.
+%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
+	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
+
+kernel.link.clean:
+	rm -rf *.riscv $(RISCV_LINK_SCRIPT)
+
+# Linked kernels are marked precious; this file's clean targets are prerequisites of clean.
+.PRECIOUS: %.riscv
+.PHONY: kernel.link.clean kernel.compile.clean
+clean: kernel.link.clean kernel.compile.clean
+
diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile
new file mode 100644
index 000000000..74c5c5b7c
--- /dev/null
+++ b/examples/graphit/test_vec_add_parallel/Makefile
@@ -0,0 +1,142 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+# environment.mk comes first: the paths it sets (EXAMPLES_PATH, BSG_MANYCORE_DIR) are used below.
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+
+# TEST_NAME is the basename of the executable
+TEST_NAME = main
+# KERNEL_NAME is the name of the CUDA-Lite Kernel (passed to the test through C_ARGS)
+KERNEL_NAME = vec_add_parallel
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.c
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
+CDEFINES += 
+CXXDEFINES += 
+
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS)
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+# main.c includes cl_manycore_regression.h: add its directory and rebuild main.o when it changes
+main.o: INCLUDES += -I$(EXAMPLES_PATH)
+main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS +=
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+BSG_MANYCORE_KERNELS = kernel.riscv
+# kernel.rvo is compiled with clang++ (target-specific RISCV_CXX) and is a prerequisite of kernel.riscv.
+kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
+kernel.riscv: kernel.rvo
+
+# Tile Group Dimensions (main.c launches with tg_dim = 2x2; keep the two in sync)
+TILE_GROUP_DIM_X = 2
+TILE_GROUP_DIM_Y = 2
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For this test the C arguments are: <Path to RISC-V Binary> <Kernel Name>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+# Regression Flow
+###############################################################################
+# Passes when main.exec.log contains the PASSED banner; grep's exit status is the result.
+regression: main.exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+###############################################################################
+# Default rules, help, and clean
+###############################################################################
+.DEFAULT_GOAL := help
+help:
+	@echo "Usage:"
+	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
+	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
+	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
+	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
+	@echo "      clean: Remove all subdirectory-specific outputs"
+
+
+.PHONY: clean
+# No recipe here; presumably the included *.mk files attach the actual clean steps (confirm).
+clean:
+
+
diff --git a/examples/graphit/test_vec_add_parallel/kernel.cpp b/examples/graphit/test_vec_add_parallel/kernel.cpp
new file mode 100644
index 000000000..b2ea1ae88
--- /dev/null
+++ b/examples/graphit/test_vec_add_parallel/kernel.cpp
@@ -0,0 +1,20 @@
+//This kernel adds 2 vectors 
+
+#include <bsg_manycore.h>
+#include <bsg_set_tile_x_y.h>
+#include <bsg_tile_group_barrier.hpp>
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+extern "C" __attribute__ ((noinline))
+int kernel_vec_add_parallel(int *A, int *B, int *C, int N, int block_size_x) {
+	// Each tile group handles one block of block_size_x elements; its tiles stride by the group size and stop at N.
+	int start_x = block_size_x * (__bsg_tile_group_id_y * __bsg_grid_dim_x + __bsg_tile_group_id_x);
+	for (int iter_x = __bsg_id; iter_x < block_size_x && start_x + iter_x < N; iter_x += bsg_tiles_X * bsg_tiles_Y) {
+		C[start_x + iter_x] = A[start_x + iter_x] + B[start_x + iter_x];
+	}
+	// Synchronize on the tile-group barrier before returning.
+	barrier.sync();
+
+	return 0;
+}
diff --git a/examples/graphit/test_vec_add_parallel/main.c b/examples/graphit/test_vec_add_parallel/main.c
new file mode 100644
index 000000000..07c9bd209
--- /dev/null
+++ b/examples/graphit/test_vec_add_parallel/main.c
@@ -0,0 +1,196 @@
+// Copyright (c) 2019, University of Washington All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// 
+// Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the following disclaimer.
+// 
+// Redistributions in binary form must reproduce the above copyright notice, this
+// list of conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.
+// 
+// Neither the name of the copyright holder nor the names of its contributors may
+// be used to endorse or promote products derived from this software without
+// specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <stdio.h>
+#include <cl_manycore_regression.h>
+
+#define ALLOC_NAME "default_allocator"
+
+/*!
+ * Runs the vector addition on a grid of 2x2 tile groups. A[N] + B[N] --> C[N]
+ * Grid dimensions are determined by how much of a load we want for each tile group (block_size_x)
+ * This test uses the software/spmd/bsg_cuda_lite_runtime/vec_add_parallel/ Manycore binary in the BSG Manycore bitbucket repository.
+*/
+
+/* Host reference implementation: stores A[i] + B[i] into C[i] for every i in [0, N). */
+void host_vec_add (int *A, int *B, int *C, int N) { 
+        for (int i = 0; i < N; i ++) { 
+                C[i] = A[i] + B[i];
+        }
+        return;
+}
+
+/* Runs the vec_add_parallel kernel on every pod and checks the device result against host_vec_add. */
+/* Returns HB_MC_SUCCESS if all pods match, HB_MC_FAIL on a mismatch; BSG_CUDA_CALL (defined elsewhere) presumably returns early on error. */
+int kernel_vec_add_parallel (int argc, char **argv) {
+        char *bin_path, *test_name;
+        struct arguments_path args = {NULL, NULL};
+
+        argp_parse (&argp_path, argc, argv, 0, 0, &args);
+        bin_path = args.path;
+        test_name = args.name;
+
+        bsg_pr_test_info("Running the CUDA Vector Addition Kernel on a grid of 2x2 tile groups.\n\n");
+
+        srand((unsigned int) time(NULL)); /* seed from the current time; `time` alone is a function pointer, not a time value */
+
+        /*********************/
+        /* Initialize device */
+        /*********************/
+        hb_mc_device_t device;
+        BSG_CUDA_CALL(hb_mc_device_init(&device, test_name, 0));
+
+        hb_mc_pod_id_t pod;
+        hb_mc_device_foreach_pod_id(&device, pod)
+        {
+                /**********************************************************************/
+                /* Define path to binary.                                             */
+                /* Initialize device, load binary and unfreeze tiles.                 */
+                /**********************************************************************/
+                bsg_pr_test_info("Loading program for %s onto pod %d\n",
+                                 test_name, pod);
+
+                BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod));
+                BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0));
+
+                /*****************************************************************************************************************
+                 * Allocate memory on the device for A, B and C.
+                 ******************************************************************************************************************/
+                uint32_t N = 1024;
+
+                eva_t A_device, B_device, C_device;
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &A_device)); /* allocate A[N] on the device */
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &B_device)); /* allocate B[N] on the device */
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &C_device)); /* allocate C[N] on the device */
+
+                /*****************************************************************************************************************
+                 * Allocate memory on the host for A & B and initialize with random values.
+                 ******************************************************************************************************************/
+                uint32_t A_host[N]; /* allocate A[N] on the host */
+                uint32_t B_host[N]; /* allocate B[N] on the host */
+                for (int i = 0; i < N; i++) { /* fill A and B with 16-bit random values */
+                        A_host[i] = rand() & 0xFFFF;
+                        B_host[i] = rand() & 0xFFFF;
+                }
+
+                /*****************************************************************************************************************
+                 * Copy A & B from host onto device DRAM.
+                 ******************************************************************************************************************/
+                void *dst = (void *) ((intptr_t) A_device);
+                void *src = (void *) &A_host[0];
+                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy A to the device  */
+
+                dst = (void *) ((intptr_t) B_device);
+                src = (void *) &B_host[0];
+                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy B to the device */
+
+                /*****************************************************************************************************************
+                 * Define block_size_x/y: amount of work for each tile group
+                 * Define tg_dim_x/y: number of tiles in each tile group
+                 * Calculate grid_dim_x/y: number of tile groups needed based on block_size_x/y
+                 ******************************************************************************************************************/
+                uint32_t block_size_x = 64;
+                hb_mc_dimension_t tg_dim = { .x = 2, .y = 2 };
+                hb_mc_dimension_t grid_dim = { .x = N / block_size_x, .y = 1 };
+
+                /*****************************************************************************************************************
+                 * Prepare list of input arguments for kernel.
+                 ******************************************************************************************************************/
+                int cuda_argv[5] = {A_device, B_device, C_device, N, block_size_x};
+
+                /*****************************************************************************************************************
+                 * Enqueue grid of tile groups, pass in grid and tile group dimensions, kernel name, number and list of input arguments
+                 ******************************************************************************************************************/
+                BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_vec_add_parallel", 5, cuda_argv));
+
+                /*****************************************************************************************************************
+                 * Launch and execute all tile groups on device and wait for all to finish.
+                 ******************************************************************************************************************/
+                BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device));
+
+                /*****************************************************************************************************************
+                 * Copy result vector back from device DRAM into host memory.
+                 ******************************************************************************************************************/
+                uint32_t C_host[N];
+                src = (void *) ((intptr_t) C_device);
+                dst = (void *) &C_host[0];
+                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, (void *) dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_HOST)); /* copy C to the host */
+
+                /*****************************************************************************************************************
+                 * Freeze the tiles and memory manager cleanup.
+                 ******************************************************************************************************************/
+                BSG_CUDA_CALL(hb_mc_device_program_finish(&device));
+
+                /*****************************************************************************************************************
+                 * Calculate the expected result using host code and compare the results.
+                 ******************************************************************************************************************/
+                uint32_t C_expected[N];
+                host_vec_add ((int *) A_host, (int *) B_host, (int *) C_expected, N);
+
+                int mismatch = 0;
+                for (int i = 0; i < N; i++) {
+                        if (C_expected[i] != C_host[i]) {
+                                bsg_pr_err(BSG_RED("Mismatch: ") "C[%d]:  0x%08" PRIx32 " + 0x%08" PRIx32 " = 0x%08" PRIx32 "\t Expected: 0x%08" PRIx32 "\n",
+                                           i , A_host[i], B_host[i], C_host[i], C_expected[i]);
+                                mismatch = 1;
+                        }
+                }
+
+                if (mismatch) {
+                        BSG_CUDA_CALL(hb_mc_device_finish(&device)); /* release the device before reporting failure */
+                        return HB_MC_FAIL;
+                }
+        }
+
+        BSG_CUDA_CALL(hb_mc_device_finish(&device));
+
+        return HB_MC_SUCCESS;
+}
+/* Entry point (named vcs_main when VCS is defined): runs the test, reports pass/fail, and returns the test's status code. */
+#ifdef VCS
+int vcs_main(int argc, char ** argv)
+#else
+int main(int argc, char ** argv)
+#endif
+{
+        bsg_pr_test_info("test_vec_add_parallel Regression Test \n");
+        int rc = kernel_vec_add_parallel(argc, argv);
+        bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
+        return rc;
+}
+
+

From 7b091c95af49717dcedff8bb4d3a3423d75f6082 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 22 Mar 2021 14:52:25 -0700
Subject: [PATCH 10/22] broken pr-nibble test

---
 examples/graphit/Makefile                     |   1 +
 examples/graphit/test_pr_nibble/Makefile      | 150 +++++++++++
 examples/graphit/test_pr_nibble/kernel.cpp    | 225 ++++++++++++++++
 .../kernel/include/pr_nibble.hpp              |   9 +
 examples/graphit/test_pr_nibble/main.cpp      | 247 ++++++++++++++++++
 examples/graphit/test_pr_nibble/pr.hpp        |  25 ++
 examples/graphit/test_pr_nibble/pr_host.hpp   |  53 ++++
 .../graphit/test_vec_add_parallel/Makefile    |   2 +-
 8 files changed, 711 insertions(+), 1 deletion(-)
 create mode 100644 examples/graphit/test_pr_nibble/Makefile
 create mode 100644 examples/graphit/test_pr_nibble/kernel.cpp
 create mode 100644 examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp
 create mode 100644 examples/graphit/test_pr_nibble/main.cpp
 create mode 100644 examples/graphit/test_pr_nibble/pr.hpp
 create mode 100644 examples/graphit/test_pr_nibble/pr_host.hpp

diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile
index 1ac9533b9..f8389272b 100644
--- a/examples/graphit/Makefile
+++ b/examples/graphit/Makefile
@@ -46,6 +46,7 @@ include $(EXAMPLES_PATH)/link.mk
 
 # Define the tests that get run
 TESTS += test_vec_add_parallel
+TESTS += test_pr_nibble
 
 regression: $(TESTS)
 	@echo "GRAPHIT REGRESSION PASSED"
diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
new file mode 100644
index 000000000..d456fd96d
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -0,0 +1,150 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new
+CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
+
+# TEST_NAME is the basename of the executable
+TEST_NAME = main
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = pr_nibble
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.cpp
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
+CDEFINES += 
+CXXDEFINES += 
+
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++14 $(FLAGS)
+
+HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+# Specify any header file dependencies
+main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
+main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS +=
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+BSG_MANYCORE_KERNELS = kernel.riscv
+
+kernel.rvo: RISCV_CXX = $(RISCV_GXX)
+kernel.riscv: kernel.rvo
+
+# Tile Group Dimensions
+TILE_GROUP_DIM_X = 16
+TILE_GROUP_DIM_Y = 8
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
+
+include $(EXAMPLES_PATH)/graphit/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+# Regression Flow
+###############################################################################
+
+regression: main.exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+###############################################################################
+# Default rules, help, and clean
+###############################################################################
+.DEFAULT_GOAL := help
+help:
+	@echo "Usage:"
+	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
+	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
+	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
+	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
+	@echo "      clean: Remove all subdirectory-specific outputs"
+
+
+.PHONY: clean
+
+clean:
+
+
diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp
new file mode 100644
index 000000000..d49112e73
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/kernel.cpp
@@ -0,0 +1,225 @@
+//#define DEBUG
+#include <bsg_manycore.h>
+
+#ifdef DEBUG
+#define BSG_TILE_GROUP_X_DIM 1 
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
+#else
+#include <bsg_set_tile_x_y.h>
+// #define BSG_TILE_GROUP_X_DIM 16 
+// #define BSG_TILE_GROUP_Y_DIM 8
+#endif
+
+#include <bsg_tile_group_barrier.hpp>
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+#include <pr_nibble.hpp>
+#include <cstring>
+
+#ifdef DEBUG
+#define pr_dbg(fmt, ...)			\
+		bsg_printf(fmt, ##__VA_ARGS__)
+#else
+#define pr_dbg(fmt, ...)
+#endif
+
+__attribute__((section(".dram"))) double  * __restrict p;
+__attribute__((section(".dram"))) double  * __restrict old_rank;
+__attribute__((section(".dram"))) double  * __restrict new_rank;
+__attribute__((section(".dram"))) int  * __restrict out_degree;
+__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
+//__attribute__((section(".dram"))) double alpha = 0.15; 
+//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; 
+
+template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
+{
+  // Pull traversal: for each destination d in the range that local_range(V, ...) yields, calls
+  // apply_func(s, d) for every in-neighbor s with from_vertexset[s] != 0. E and block_size_x are unused; returns 0.
+  int start, end;
+  local_range(V, &start, &end);
+  for ( int d = start; d < end; d++) {
+    int degree = in_indices[d + 1] - in_indices[d];
+    int * neighbors = &in_neighbors[in_indices[d]];
+    for(int s = 0; s < degree; s++) { 
+      if(from_vertexset[neighbors[s]]) {
+        //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); 
+        apply_func (neighbors[s] , d);
+      }
+    } //end of loop on in neighbors
+  } //end of outer for loop
+  return 0;
+} //end of edgeset apply function 
+
+template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
+{
+  // Push traversal: for each source s in the range that local_range(V, ...) yields and that has
+  // from_vertexset[s] != 0, calls apply_func(s, d) for every out-neighbor d. E and block_size_x are unused; returns 0.
+  int start, end;
+  local_range(V, &start, &end);
+  for ( int s = start; s < end; s++) {
+    if(from_vertexset[s]) {
+      int degree = out_indices[s + 1] - out_indices[s];
+      int * neighbors = &out_neighbors[out_indices[s]];
+      for(int d = 0; d < degree; d++) { 
+        apply_func (s, neighbors[d]);
+	// NOTE(review): nothing in this function stops two sources from targeting the same
+	// destination at the same time; confirm apply_func tolerates concurrent updates.
+      }
+    } //end of frontier-membership check
+  } //end of outer for loop
+  //barrier.sync();
+  return 0;
+} //end of edgeset apply function 
+
+
+// Functor: copies generated_tmp_vector_3[v] into out_degree[v].
+struct generated_vector_op_apply_func_4
+{
+  void operator() (int v) {
+    out_degree[v] = generated_tmp_vector_3[v];
+  }
+};
+// Functor: resets new_rank[v] to zero.
+struct new_rank_generated_vector_op_apply_func_2
+{
+  void operator() (int v) {
+    new_rank[v] = 0.0f;
+  }
+};
+// Functor: resets old_rank[v] to zero.
+struct old_rank_generated_vector_op_apply_func_1
+{
+  void operator() (int v) {
+    old_rank[v] = 0.0f;
+  }
+};
+// Functor: resets p[v] to zero.
+struct p_generated_vector_op_apply_func_0
+{
+  void operator() (int v) {
+    p[v] = 0.0f;
+  }
+};
+struct updateEdge // adds ((1 - alpha) / (1 + alpha)) * old_rank[src] / out_degree[src] to new_rank[dst]
+{
+  void operator() (int src, int dst)
+  {
+    double alpha = 0.15; 
+    new_rank[dst] = (new_rank[dst] + (((((1)  - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src]));
+  };
+};
+struct updateSelf // adds ((2 * alpha) / (1 + alpha)) * old_rank[v] to p[v], then zeroes new_rank[v]
+{
+  void operator() (int v)
+  {
+    double alpha = 0.15; 
+    p[v] = (p[v] + ((((2)  * alpha) / ((1)  + alpha)) * old_rank[v]));
+    new_rank[v] = (0) ;
+  };
+};
+struct filter_frontier // true when old_rank[v] is non-zero and exceeds out_degree[v] * epsilon
+{
+  bool operator() (int v)
+  {
+    double epsilon = (double) 1e-6; 
+    bool output ;
+    if(old_rank[v] == 0) return 0;
+    //output = (old_rank[v]) > ((out_degree[v] * epsilon));
+    output = (old_rank[v]) > ((out_degree[v] * epsilon)); // NOTE(review): host_pr_calc tests >= and out_degree > 0; confirm which is intended
+    return output;
+  };
+};
+
+// Applies p_generated_vector_op_apply_func_0 over the local_range(V, ...) span, then hits the barrier; returns 0.
+extern "C" int  __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	p_generated_vector_op_apply_func_0 apply;
+	for (int v = first; v < last; ++v) apply(v);
+	barrier.sync();
+	return 0;
+}
+// Applies old_rank_generated_vector_op_apply_func_1 over the local_range(V, ...) span, then hits the barrier; returns 0.
+extern "C" int  __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	old_rank_generated_vector_op_apply_func_1 apply;
+	for (int v = first; v < last; ++v) apply(v);
+	barrier.sync();
+	return 0;
+}
+// Applies new_rank_generated_vector_op_apply_func_2 over the local_range(V, ...) span, then hits the barrier; returns 0.
+extern "C" int  __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	new_rank_generated_vector_op_apply_func_2 apply;
+	for (int v = first; v < last; ++v) apply(v);
+	barrier.sync();
+	return 0;
+}
+// Applies generated_vector_op_apply_func_4 over the local_range(V, ...) span, then hits the barrier; returns 0.
+extern "C" int  __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	generated_vector_op_apply_func_4 apply;
+	for (int v = first; v < last; ++v) apply(v);
+	barrier.sync();
+	return 0;
+}
+// Applies updateSelf over the local_range(V, ...) span between the print_stat
+// start/end markers for tag_c; a barrier follows each marker. Always returns 0.
+extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int V, int tag_c) {
+	bsg_cuda_print_stat_start(tag_c);
+	barrier.sync();
+	int first, last;
+	local_range(V, &first, &last);
+	updateSelf self_op;
+	for (int v = first; v < last; ++v) self_op(v);
+	bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
+	barrier.sync();
+        // Kernel entry: runs the pull traversal with updateEdge between the print_stat markers for tag_c; returns 0.
+  bsg_cuda_print_stat_start(tag_c);
+	edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+
+ extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
+	barrier.sync(); // kernel entry: sync, then run the push traversal with updateEdge between the print_stat markers for tag_c
+  bsg_cuda_print_stat_start(tag_c);
+	edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+
+// Recomputes the frontier flags in next5: for each index v in the range that
+// local_range(V, ...) yields, capped at V, next5[v] is set to 1 when
+// filter_frontier accepts v and to 0 otherwise. block_size_x is not used.
+// The work sits between the print_stat start/end markers for tag_c, and a
+// barrier follows each marker. Always returns 0.
+extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) {
+	bsg_cuda_print_stat_start(tag_c);
+	barrier.sync();
+	int first, last;
+	local_range(V, &first, &last);
+	filter_frontier keep;
+	for (int v = first; v < last; ++v) {
+		// Never touch indices at or beyond V.
+		if (v >= V) { break; }
+		// Each visited index gets exactly one flag value, 0 or 1.
+		next5[v] = keep(v) ? 1 : 0;
+	}
+	bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+
+
diff --git a/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp
new file mode 100644
index 000000000..ee50a54d6
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef __PR_PULL_BENCHMARK_HPP
+#define __PR_PULL_BENCHMARK_HPP
+
+#include <math.h>
+#include <local_range.h>
+#include <vertex_struct.h>
+#include <atomics.h>
+#endif
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
new file mode 100644
index 000000000..e7813101a
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -0,0 +1,247 @@
+#include "pr.hpp"
+
+//#define DEBUG
+
+#define VERIFY 1
+
+#ifdef DEBUG
+#define X 1 
+#define Y 1
+#else
+#define X 16 
+#define Y 8
+#endif
+
+#define ROOT 6 //eventually we will need to do 50 start vertices (in parallel)
+#define NUM_LOCKS 1024 //width of manycore * 64
+
+GraphHB edges; 
+GlobalScalar<hb_mc_eva_t> p_dev;
+GlobalScalar<hb_mc_eva_t> old_rank_dev;
+GlobalScalar<hb_mc_eva_t> new_rank_dev;
+GlobalScalar<hb_mc_eva_t> out_degree_dev;
+GlobalScalar<float > alpha_dev;
+GlobalScalar<float > epsilon_dev;
+
+#include "pr_host.hpp"
+
+// Host driver for the pr-nibble test: loads the kernel binary and the "-g" graph,
+// seeds frontier/rank state at ROOT, runs one device iteration of the variant picked
+// from the kernel path ("push", "block", "hybrid"; pull otherwise) and, when VERIFY
+// is set, dumps old_rank to ./rank.txt. Returns 0 on completion, HB_MC_FAIL without "-g".
+int launch(int argc, char ** argv){
+  InputParser input(argc, argv);
+  if(!input.cmdOptionExists("-g")){
+    std::cerr << "no input args\n";
+    return HB_MC_FAIL; // without a graph nothing ran, so this must not count as a pass
+  }
+  std::string ucode_path = input.getRISCVFile();
+
+  int iter = 0;
+  // Number of iterations host_pr_calc replays before the device step; fixed at 0 for now.
+  std::cerr << "iteration: " << iter << std::endl;
+
+  int version = 0; //default to vertex pull
+  if(ucode_path.find("push") != std::string::npos) {
+    version = 1;
+  }
+  else if(ucode_path.find("block") != std::string::npos) {
+    version = 2;
+  }
+  int hybrid = 0; //default: no direction selection via calculate_direction
+  if(ucode_path.find("hybrid") != std::string::npos) {
+    hybrid = 1;
+  }
+  std::cerr << "version: " << version << std::endl;
+  std::cerr << "load microcode" << std::endl;
+  hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
+
+  std::cerr << "load graph" << std::endl;
+  std::string graph_f = input.getCmdOption("-g");
+  edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); 
+
+  std::cerr << "size of graph: " << std::endl;
+  std::cerr << edges.num_nodes() << std::endl;
+  std::cerr << edges.num_edges() << std::endl; 
+
+  std::cerr << "init global scalars" << std::endl; 
+  p_dev = GlobalScalar<hb_mc_eva_t>("p");
+  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), p_dev);
+  old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
+  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
+  new_rank_dev = GlobalScalar<hb_mc_eva_t>("new_rank");
+  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
+  out_degree_dev = GlobalScalar<hb_mc_eva_t>("out_degree");
+  hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
+  alpha_dev = GlobalScalar<float>("alpha");
+  epsilon_dev = GlobalScalar<float>("epsilon");
+
+  std::cerr << "init locks" << std::endl;
+  GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
+  hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
+  std::atomic<int> tmp_a[NUM_LOCKS] = {};
+
+  Device::Ptr device = Device::GetInstance();
+
+  float alpha = ((float) 0.15) ;
+  float epsilon = ((float) 1e-06) ;
+  int start_vertex = ROOT;
+
+
+  Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
+  //Vector<int32_t> next_frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
+
+  std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
+  std::vector<double> p(edges.num_nodes(), (double) 0.0);
+  std::vector<double> new_rank(edges.num_nodes(), (double) 0.0);
+  std::vector<double> old_rank(edges.num_nodes(), (double) 0.0);
+  std::vector<int32_t> out_degs = edges.get_out_degrees();
+
+  //compute up to current iter on host
+  hfrontier[start_vertex] = 1;
+  new_rank[start_vertex] = (double) 1.0;
+  old_rank[start_vertex] = (double) 1.0;
+  host_pr_calc(p, old_rank, new_rank, hfrontier, iter);
+
+  frontier.copyToDevice(hfrontier.data(), hfrontier.size());
+
+  //next_frontier.copyToDevice(zeros.data(), zeros.size());
+  hammerblade::write_global_buffer_dma<double>(p.data(), p_dev, p.size());  
+  hammerblade::write_global_buffer_dma<double>(old_rank.data(), old_rank_dev, old_rank.size());  
+  hammerblade::write_global_buffer_dma<double>(new_rank.data(), new_rank_dev, new_rank.size());  
+  hammerblade::write_global_buffer_dma<int32_t>(out_degs.data(), out_degree_dev, out_degs.size());  
+  hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_a, glbl_locks, NUM_LOCKS);
+
+  device->freeze_cores();
+  device->write_dma();
+  device->unfreeze_cores(); 
+  if(hybrid || version == 2) {
+    int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1);
+    int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges());
+    if(dir){ 
+      if(version != 2) version = 0; //pull
+    } else {
+      version = 1; //push
+    }
+  }
+
+  //alpha_dev.set(alpha);
+  //epsilon_dev.set(epsilon);
+
+  //hammerblade::builtin_addVertexHB(frontier, start_vertex);
+  //hammerblade::insert_val(start_vertex, ((double) 1) , old_rank_dev);
+  //hammerblade::insert_val(start_vertex, ((double) 1) , new_rank_dev);
+
+  std::cerr << "start of while loop\n";
+  int tag_c = 0;
+  //double host_rank[edges.num_nodes()];
+  //ofstream prog_file;
+  //prog_file.open("./progress.txt");
+  //prog_file << "starting computation w/ root vertex: " << start_vertex << std::endl;
+  //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) 
+  //while ( iter < 16) 
+  for(int i = 0; i < 1; i++)
+  {
+    int f_sz = 0;
+    //new_rank = old_rank;
+    switch(version) {
+      case 0: //vertex pull
+	    std::cerr << "pull kernel\n";
+    	std::cerr << "run update self vertex kernel\n";
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->runJobs();
+    	tag_c++;
+    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
+      device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+    	device->runJobs();
+      tag_c++;
+    	std::cerr << "swap arrays\n";
+    	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev); // before filtering, as in the push path and host_pr_calc
+    	std::cerr << "create next frontier\n";
+    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+    	device->runJobs();
+      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+      break;
+      case 1: //vertex push
+	    std::cerr << "push kernel\n";
+    	std::cerr << "run update self vertex kernel\n";
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->runJobs();
+    	tag_c++;
+    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
+      device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); 
+    	device->runJobs();
+      tag_c++;
+    	std::cerr << "swap arrays\n";
+   	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev);
+    	std::cerr << "create next frontier\n";
+    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+    	device->runJobs();
+      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+      break;
+      case 2: //blocked pull
+	    std::cerr << "blocked pull kernel\n";
+    	std::cerr << "run update self vertex kernel\n";
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->runJobs();
+    	tag_c++;
+    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
+      device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInVertexlistAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+    	device->runJobs();
+      tag_c++;
+    	std::cerr << "swap arrays\n";
+    	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev); // before filtering, as in the push path and host_pr_calc
+    	std::cerr << "create next frontier\n";
+    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+    	device->runJobs();
+      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+      break;
+    }
+    tag_c++;
+
+    iter++;
+    //prog_file << "finished iteration: " << iter << std::endl; 
+  }
+  std::cerr << "*******end of program********\n";
+  //prog_file << "*******end of program********\n";
+  std::cerr << "took: " << iter << " iterations to complete\n";
+  //prog_file << "took: " << iter << " iterations to complete\n";
+  //prog_file.close();
+  if(VERIFY) {
+    ofstream ver_file;
+    ver_file.open("./rank.txt");
+    std::vector<double> host_rank(edges.num_nodes()); // heap buffer: a stack VLA is non-standard C++ and can overflow on large graphs
+    hammerblade::read_global_buffer_dma<double>(host_rank.data(), old_rank_dev, edges.num_nodes());
+    for(int i = 0; i < edges.num_nodes(); i++) {
+      ver_file << host_rank[i] << std::endl;
+    }
+    ver_file.close();  
+  }
+  return 0;
+}
+
+#ifdef VCS
+int vcs_main(int argc, char ** argv){
+    // int argc = get_argc(args);
+    // char *argv[argc];
+    // get_argv(args, argc, argv);
+    // svScope scope;
+    // scope = svGetScopeFromName("tb");
+    // svSetScope(scope);
+    bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n");
+    int rc = launch(argc, argv); 
+    //*exit_code = rc;
+    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
+    return rc;
+}
+#else
+int main(int argc, char ** argv) {
+    bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n");
+    int rc = launch(argc, argv);
+    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
+    return rc;
+}
+#endif 
diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp
new file mode 100644
index 000000000..b73169532
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/pr.hpp
@@ -0,0 +1,25 @@
+#ifndef __PR_PULL_BENCHMARK_HPP
+#define __PR_PULL_BENCHMARK_HPP
+
+#include "hb_intrinsics.h"
+#include "infra_hb/host/arg_parser.hpp"
+#include <string.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream> 
+#include <atomic>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+#include <cl_manycore_regression.h>
+//#include "../common.h"
+
+
+using hammerblade::Device;
+using hammerblade::Vector;
+using hammerblade::GraphHB;
+using hammerblade::GlobalScalar;
+
+#endif
diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp
new file mode 100644
index 000000000..0168845f0
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/pr_host.hpp
@@ -0,0 +1,53 @@
+//function to compute pr-nibble on host up to current iter
+#pragma once
+#include <iostream>
+#include <fstream>
+
+// Host reference for pr-nibble: replays `iter` iterations on the host so the device
+// can start from the resulting state. Reads the graph from the file-level `edges`.
+//   p, old_rank, new_rank, frontier: per-vertex state, updated in place.
+//   iter: number of iterations to run; 0 leaves all four vectors untouched.
+// Side effects: creates "iter-<iter>.txt" and logs the contributions of the last
+// iteration to it as "<src> <dst> <new_rank[dst]>"; prints frontier sizes to stderr.
+inline void host_pr_calc(std::vector<double> & p, std::vector<double> & old_rank, std::vector<double> & new_rank, std::vector<int> & frontier, int iter) {
+    const double alpha = 0.15;
+    const double epsilon = 1e-06;
+    auto g = edges.getHostGraph();
+    const std::string fname = "iter-" + std::to_string(iter) + ".txt";
+    std::ofstream ofile(fname);
+    for(int i = 0; i < iter; i++) {
+        new_rank.assign(old_rank.begin(), old_rank.end());
+        //print out iteration and size:
+        int num_items = std::count(frontier.begin(), frontier.end(), 1);
+        std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl;
+        //update_self
+        for(int v = 0; v < g.num_nodes(); v++) {
+            p[v] += (2.0 * alpha) / (1.0  + alpha) * old_rank[v];
+            new_rank[v] = 0.0;
+        }
+        //update edges: pull from in-neighbors that are in the frontier
+        for(int d = 0; d < g.num_nodes(); d++) {
+            for(int s : g.in_neigh(d)) {
+                if(frontier[s]){
+                    double update = ((1.0 - alpha) / (1.0  + alpha)) * old_rank[s];
+                    update = update / ((double) g.out_degree(s));
+                    new_rank[d] += update;
+                    if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;}
+                }
+            }
+        }
+        //this iteration's ranks become the input of the next one
+        old_rank.assign(new_rank.begin(), new_rank.end());
+        //update frontier
+        for(int v = 0; v < g.num_nodes(); v++) {
+            frontier[v] = 0;
+            // NOTE(review): the device filter_frontier uses > and only rejects old_rank == 0; confirm which test is intended.
+            if(g.out_degree(v) > 0 && old_rank[v] >= (((double) g.out_degree(v)) * epsilon)) {
+                frontier[v] = 1;
+            }
+        }
+    }
+    ofile.close();
+    int num_items = std::count(frontier.begin(), frontier.end(), 1);
+    std::cerr << "returning with frontier size: " << num_items << std::endl;
+}
diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile
index 74c5c5b7c..4291f23a1 100644
--- a/examples/graphit/test_vec_add_parallel/Makefile
+++ b/examples/graphit/test_vec_add_parallel/Makefile
@@ -98,7 +98,7 @@ TILE_GROUP_DIM_Y = 2
 RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
 RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
 
-include $(EXAMPLES_PATH)/cuda/riscv.mk
+include $(EXAMPLES_PATH)/graphit/riscv.mk
 
 ###############################################################################
 # Execution flow

From e0276f6674ca3e6570c31136791f219ab3dd0070 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 22 Mar 2021 16:17:00 -0700
Subject: [PATCH 11/22] updated, still broken

---
 examples/graphit/riscv.mk                  |  4 ++--
 examples/graphit/test_pr_nibble/Makefile   |  6 +++---
 examples/graphit/test_pr_nibble/kernel.cpp | 12 ++++++------
 examples/graphit/test_pr_nibble/main.cpp   | 10 ++--------
 examples/graphit/test_pr_nibble/pr.hpp     |  2 +-
 5 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk
index 1266a5e7f..87a52d511 100644
--- a/examples/graphit/riscv.mk
+++ b/examples/graphit/riscv.mk
@@ -54,7 +54,7 @@ RISCV_LLVM_PATH  := $(RISCV_TOOLS_PATH)/llvm/llvm-install
 RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py
 
 # These flags are not supported by clang
-RISCV_GNU_FLAGS = -mno-fdiv -frerun-cse-after-loop -fweb -frename-registers
+RISCV_GNU_FLAGS = -frerun-cse-after-loop -fweb -frename-registers -mtune=bsg_vanilla_2020
 
 RISCV_GCC        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS)
 RISCV_GXX        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS)
@@ -66,7 +66,7 @@ RISCV_LINK       ?= $(RISCV_GCC) -t -T $(LINK_SCRIPT) $(RISCV_LDFLAGS)
 RISCV_LD         ?= $(RISCV_GCC)
 
 RISCV_CLANG_ABI        = ilp32f
-RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI)
+RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) -march=riscv32imaf -mtune=hb-rv32
 RISCV_CLANG_CXXFLAGS  += --sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs
 RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0
 RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs  
diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index d456fd96d..0088fe364 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -67,7 +67,7 @@ CXXDEFINES +=
 
 FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
 CFLAGS   += -std=c99 $(FLAGS)
-CXXFLAGS += -std=c++14 $(FLAGS)
+CXXFLAGS += -std=c++14 $(FLAGS) 
 
 HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
 
@@ -82,7 +82,7 @@ main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
 # Host code link flags and flow
 ###############################################################################
 
-LDFLAGS +=
+LDFLAGS += 
 
 # link.mk defines rules for linking of the final execution binary.
 include $(EXAMPLES_PATH)/link.mk
@@ -116,7 +116,7 @@ include $(EXAMPLES_PATH)/graphit/riscv.mk
 #
 # SIM_ARGS: Use this to pass arguments to the simulator
 ###############################################################################
-C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) #-g $(GRAPH_PATH)
 
 SIM_ARGS ?=
 
diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp
index d49112e73..c115b9f73 100644
--- a/examples/graphit/test_pr_nibble/kernel.cpp
+++ b/examples/graphit/test_pr_nibble/kernel.cpp
@@ -25,9 +25,9 @@ bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
 #define pr_dbg(fmt, ...)
 #endif
 
-__attribute__((section(".dram"))) double  * __restrict p;
-__attribute__((section(".dram"))) double  * __restrict old_rank;
-__attribute__((section(".dram"))) double  * __restrict new_rank;
+__attribute__((section(".dram"))) float  * __restrict p;
+__attribute__((section(".dram"))) float  * __restrict old_rank;
+__attribute__((section(".dram"))) float  * __restrict new_rank;
 __attribute__((section(".dram"))) int  * __restrict out_degree;
 __attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
 //__attribute__((section(".dram"))) double alpha = 0.15; 
@@ -106,7 +106,7 @@ struct updateEdge
 {
   void operator() (int src, int dst)
   {
-    double alpha = 0.15; 
+    float alpha = 0.15; 
     new_rank[dst] = (new_rank[dst] + (((((1)  - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src]));
   };
 };
@@ -114,7 +114,7 @@ struct updateSelf
 {
   void operator() (int v)
   {
-    double alpha = 0.15; 
+    float alpha = 0.15; 
     p[v] = (p[v] + ((((2)  * alpha) / ((1)  + alpha)) * old_rank[v]));
     new_rank[v] = (0) ;
   };
@@ -123,7 +123,7 @@ struct filter_frontier
 {
   bool operator() (int v)
   {
-    double epsilon = (double) 1e-6; 
+    float epsilon = (float) 1e-6; 
     bool output ;
     if(old_rank[v] == 0) return 0;
     //output = (old_rank[v]) > ((out_degree[v] * epsilon));
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index e7813101a..3897a44d4 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -57,7 +57,8 @@ int launch(int argc, char ** argv){
   hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
 
   std::cerr << "load graph" << std::endl;
-  std::string graph_f = input.getCmdOption("-g");
+  //std::string graph_f = input.getCmdOption("-g");
+  std::string graph_f = "~/research/bladerunner6.0/graphit-new/test/graphs/darpa-eval/jhu.mtx"; 
   edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); 
 
   std::cerr << "size of graph: " << std::endl;
@@ -225,15 +226,8 @@ int launch(int argc, char ** argv){
 
 #ifdef VCS
 int vcs_main(int argc, char ** argv){
-    // int argc = get_argc(args);
-    // char *argv[argc];
-    // get_argv(args, argc, argv);
-    // svScope scope;
-    // scope = svGetScopeFromName("tb");
-    // svSetScope(scope);
     bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n");
     int rc = launch(argc, argv); 
-    //*exit_code = rc;
     bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
     return rc;
 }
diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp
index b73169532..b1f9ac484 100644
--- a/examples/graphit/test_pr_nibble/pr.hpp
+++ b/examples/graphit/test_pr_nibble/pr.hpp
@@ -1,3 +1,4 @@
+#pragma once
 #ifndef __PR_PULL_BENCHMARK_HPP
 #define __PR_PULL_BENCHMARK_HPP
 
@@ -14,7 +15,6 @@
 #include <bsg_manycore_loader.h>
 #include <bsg_manycore_cuda.h>
 #include <cl_manycore_regression.h>
-//#include "../common.h"
 
 
 using hammerblade::Device;

From e7f74e62bd59cbb7bb507f4c74903f6eb8bd49d6 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 29 Mar 2021 15:04:36 -0700
Subject: [PATCH 12/22] working pr nibble test

---
 examples/graphit/test_pr_nibble/Makefile    |  7 +--
 examples/graphit/test_pr_nibble/main.cpp    | 48 ++++++++++-----------
 examples/graphit/test_pr_nibble/pr_host.hpp | 18 ++++----
 3 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index 0088fe364..8dec2f3be 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -44,8 +44,8 @@ REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
 include $(REPLICANT_PATH)/environment.mk
 SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
 CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
-GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new
 CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new
 
 GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
 
@@ -106,7 +106,7 @@ RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
 
 RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
 
-include $(EXAMPLES_PATH)/graphit/riscv.mk
+include $(EXAMPLES_PATH)/cuda/riscv.mk
 
 ###############################################################################
 # Execution flow
@@ -116,7 +116,7 @@ include $(EXAMPLES_PATH)/graphit/riscv.mk
 #
 # SIM_ARGS: Use this to pass arguments to the simulator
 ###############################################################################
-C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) #-g $(GRAPH_PATH)
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
 
 SIM_ARGS ?=
 
@@ -142,6 +142,7 @@ help:
 	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
 	@echo "      clean: Remove all subdirectory-specific outputs"
 
+print-%  : ; @echo $* = $($*)
 
 .PHONY: clean
 
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index 3897a44d4..e5b434481 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -20,8 +20,8 @@ GlobalScalar<hb_mc_eva_t> p_dev;
 GlobalScalar<hb_mc_eva_t> old_rank_dev;
 GlobalScalar<hb_mc_eva_t> new_rank_dev;
 GlobalScalar<hb_mc_eva_t> out_degree_dev;
-GlobalScalar<float > alpha_dev;
-GlobalScalar<float > epsilon_dev;
+//GlobalScalar<float > alpha_dev;
+//GlobalScalar<float > epsilon_dev;
 
 #include "pr_host.hpp"
 
@@ -57,25 +57,25 @@ int launch(int argc, char ** argv){
   hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
 
   std::cerr << "load graph" << std::endl;
-  //std::string graph_f = input.getCmdOption("-g");
-  std::string graph_f = "~/research/bladerunner6.0/graphit-new/test/graphs/darpa-eval/jhu.mtx"; 
+  std::string graph_f = input.getCmdOption("-g");
   edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); 
 
   std::cerr << "size of graph: " << std::endl;
   std::cerr << edges.num_nodes() << std::endl;
   std::cerr << edges.num_edges() << std::endl; 
-
   std::cerr << "init global scalars" << std::endl; 
+
   p_dev = GlobalScalar<hb_mc_eva_t>("p");
-  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), p_dev);
+
+  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), p_dev);
   old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
-  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
+  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
   new_rank_dev = GlobalScalar<hb_mc_eva_t>("new_rank");
-  hammerblade::init_global_array<double>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
+  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
   out_degree_dev = GlobalScalar<hb_mc_eva_t>("out_degree");
   hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
-  alpha_dev = GlobalScalar<float>("alpha");
-  epsilon_dev = GlobalScalar<float>("epsilon");
+  //alpha_dev = GlobalScalar<float>("alpha");
+  //epsilon_dev = GlobalScalar<float>("epsilon");
 
   std::cerr << "init locks" << std::endl;
   GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
@@ -87,29 +87,27 @@ int launch(int argc, char ** argv){
   float alpha = ((float) 0.15) ;
   float epsilon = ((float) 1e-06) ;
   int start_vertex = ROOT;
-
-
   Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
   //Vector<int32_t> next_frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
 
   std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
-  std::vector<double> p(edges.num_nodes(), (double) 0.0);
-  std::vector<double> new_rank(edges.num_nodes(), (double) 0.0);
-  std::vector<double> old_rank(edges.num_nodes(), (double) 0.0);
+  std::vector<float> p(edges.num_nodes(), (float) 0.0);
+  std::vector<float> new_rank(edges.num_nodes(), (float) 0.0);
+  std::vector<float> old_rank(edges.num_nodes(), (float) 0.0);
   std::vector<int32_t> out_degs = edges.get_out_degrees();
 
   //compute up to current iter on host
   hfrontier[start_vertex] = 1;
-  new_rank[start_vertex] = (double) 1.0;
-  old_rank[start_vertex] = (double) 1.0;
+  new_rank[start_vertex] = (float) 1.0;
+  old_rank[start_vertex] = (float) 1.0;
   host_pr_calc(p, old_rank, new_rank, hfrontier, iter);
 
   frontier.copyToDevice(hfrontier.data(), hfrontier.size());
 
   //next_frontier.copyToDevice(zeros.data(), zeros.size());
-  hammerblade::write_global_buffer_dma<double>(p.data(), p_dev, p.size());  
-  hammerblade::write_global_buffer_dma<double>(old_rank.data(), old_rank_dev, old_rank.size());  
-  hammerblade::write_global_buffer_dma<double>(new_rank.data(), new_rank_dev, new_rank.size());  
+  hammerblade::write_global_buffer_dma<float>(p.data(), p_dev, p.size());  
+  hammerblade::write_global_buffer_dma<float>(old_rank.data(), old_rank_dev, old_rank.size());  
+  hammerblade::write_global_buffer_dma<float>(new_rank.data(), new_rank_dev, new_rank.size());  
   hammerblade::write_global_buffer_dma<int32_t>(out_degs.data(), out_degree_dev, out_degs.size());  
   hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_a, glbl_locks, NUM_LOCKS);
 
@@ -160,7 +158,7 @@ int launch(int argc, char ** argv){
     	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
     	device->runJobs();
     	std::cerr << "swap arrays\n";
-    	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev);
+    	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
       f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
       std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
       break;
@@ -175,7 +173,7 @@ int launch(int argc, char ** argv){
     	device->runJobs();
       tag_c++;
     	std::cerr << "swap arrays\n";
-   	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev);
+   	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
     	std::cerr << "create next frontier\n";
     	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
     	device->runJobs();
@@ -196,7 +194,7 @@ int launch(int argc, char ** argv){
     	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
     	device->runJobs();
     	std::cerr << "swap arrays\n";
-    	hammerblade::swap_global_arrays<double>(new_rank_dev, old_rank_dev);
+    	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
       f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
       std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
       break;
@@ -214,8 +212,8 @@ int launch(int argc, char ** argv){
   if(VERIFY) {
     ofstream ver_file;
     ver_file.open("./rank.txt");
-    double host_rank[edges.num_nodes()];
-    hammerblade::read_global_buffer_dma<double>(host_rank, old_rank_dev, edges.num_nodes());
+    float host_rank[edges.num_nodes()];
+    hammerblade::read_global_buffer_dma<float>(host_rank, old_rank_dev, edges.num_nodes());
     for(int i = 0; i < edges.num_nodes(); i++) {
       ver_file << host_rank[i] << std::endl;
     }
diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp
index 0168845f0..7e7479495 100644
--- a/examples/graphit/test_pr_nibble/pr_host.hpp
+++ b/examples/graphit/test_pr_nibble/pr_host.hpp
@@ -3,9 +3,9 @@
 #include <iostream>
 #include <fstream>
 
-inline void host_pr_calc(std::vector<double> & p, std::vector<double> & old_rank, std::vector<double> & new_rank, std::vector<int> & frontier, int iter) {
-    double alpha = (double) 0.15;
-    double epsilon = (double) 1e-06;
+inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank, std::vector<float> & new_rank, std::vector<int> & frontier, int iter) {
+    float alpha = (float) 0.15;
+    float epsilon = (float) 1e-06;
     auto g = edges.getHostGraph();
     int * in_neigh = g.in_neighbors_shared_.get();
     int ** in_index = g.in_index_shared_.get();
@@ -13,7 +13,7 @@ inline void host_pr_calc(std::vector<double> & p, std::vector<double> & old_rank
     ofstream ofile;
     ofile.open (fname);
     for(int i = 0; i < iter; i++) {
-        //std::memcpy(new_rank, old_rank, sizeof(double)*edges.num_nodes());
+        //std::memcpy(new_rank, old_rank, sizeof(float)*edges.num_nodes());
 	    //new_rank = old_rank;
         new_rank.assign(old_rank.begin(), old_rank.end());
         //print out iteration and size:
@@ -22,27 +22,27 @@ inline void host_pr_calc(std::vector<double> & p, std::vector<double> & old_rank
         //update_self
         for(int v = 0; v < g.num_nodes(); v++) {
             p[v] += (2.0 * alpha) / (1.0  + alpha) * old_rank[v];
-            new_rank[v] = (double) 0.0 ;
+            new_rank[v] = (float) 0.0 ;
         }
         //update edges
         for(int d = 0; d < g.num_nodes(); d++) {
             for(int s : g.in_neigh(d)) {
                 if(frontier[s]){
-                    double update = ((1.0 - alpha) / (1.0  + alpha)) * old_rank[s];
-										update = update / ((double) g.out_degree(s));
+                    float update = ((1.0 - alpha) / (1.0  + alpha)) * old_rank[s];
+										update = update / ((float) g.out_degree(s));
 										new_rank[d] += update;
 										if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;}
                 }
             }
         }
         //old_rank.swap(new_rank);
-        //std::memcpy(old_rank, new_rank, sizeof(double)*edges.num_nodes());
+        //std::memcpy(old_rank, new_rank, sizeof(float)*edges.num_nodes());
         //old_rank = new_rank;
         old_rank.assign(new_rank.begin(), new_rank.end());
         //update frontier
         for(int v = 0; v < g.num_nodes(); v++) {
             frontier[v] = 0;
-            if(g.out_degree(v) > 0 && old_rank[v] >= (((double) g.out_degree(v)) * epsilon)) {
+            if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) {
                 frontier[v] = 1;
             }
         }

From 206bce5f01f24091e0585c36daacab267420178e Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Thu, 1 Apr 2021 15:50:17 -0700
Subject: [PATCH 13/22] initial delta stepping test, fixes for pr nibble

---
 examples/graphit/test_pr_nibble/Makefile      |   2 +-
 examples/graphit/test_pr_nibble/kernel.cpp    |   9 +-
 examples/graphit/test_pr_nibble/main.cpp      |  14 +-
 examples/graphit/test_sssp_delta/Makefile     | 151 ++++++++++++
 examples/graphit/test_sssp_delta/kernel.cpp   |  99 ++++++++
 .../kernel/include/pr_nibble.hpp              |   9 +
 .../test_sssp_delta/kernel/include/sssp.hpp   |   8 +
 examples/graphit/test_sssp_delta/main.cpp     | 219 ++++++++++++++++++
 examples/graphit/test_sssp_delta/sssp.hpp     |  26 +++
 9 files changed, 522 insertions(+), 15 deletions(-)
 create mode 100644 examples/graphit/test_sssp_delta/Makefile
 create mode 100644 examples/graphit/test_sssp_delta/kernel.cpp
 create mode 100644 examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
 create mode 100644 examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
 create mode 100644 examples/graphit/test_sssp_delta/main.cpp
 create mode 100644 examples/graphit/test_sssp_delta/sssp.hpp

diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index 8dec2f3be..4bfca9a2b 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -67,7 +67,7 @@ CXXDEFINES +=
 
 FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
 CFLAGS   += -std=c99 $(FLAGS)
-CXXFLAGS += -std=c++14 $(FLAGS) 
+CXXFLAGS += -std=c++11 $(FLAGS) 
 
 HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
 
diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp
index c115b9f73..79186ab8a 100644
--- a/examples/graphit/test_pr_nibble/kernel.cpp
+++ b/examples/graphit/test_pr_nibble/kernel.cpp
@@ -125,9 +125,10 @@ struct filter_frontier
   {
     float epsilon = (float) 1e-6; 
     bool output ;
-    if(old_rank[v] == 0) return 0;
+    //if(old_rank[v] == 0) return 0;
+    if(new_rank[v] == 0) return 0;
     //output = (old_rank[v]) > ((out_degree[v] * epsilon));
-    output = (old_rank[v]) > ((out_degree[v] * epsilon));
+    output = (new_rank[v]) > ((out_degree[v] * epsilon));
     return output;
   };
 };
@@ -185,7 +186,9 @@ extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_verte
 	barrier.sync();
         //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c);
   bsg_cuda_print_stat_start(tag_c);
+  bsg_saif_start();
 	edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_saif_end();
   bsg_cuda_print_stat_end(tag_c);
 	barrier.sync();
 	return 0;
@@ -194,7 +197,9 @@ extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_verte
  extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
 	barrier.sync(); 
   bsg_cuda_print_stat_start(tag_c);
+  bsg_saif_start();
 	edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_saif_end();
   bsg_cuda_print_stat_end(tag_c);
 	barrier.sync();
 	return 0;
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index e5b434481..ff396f302 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -2,7 +2,7 @@
 
 //#define DEBUG
 
-#define VERIFY 1
+#define VERIFY 0
 
 #ifdef DEBUG
 #define X 1 
@@ -66,7 +66,6 @@ int launch(int argc, char ** argv){
   std::cerr << "init global scalars" << std::endl; 
 
   p_dev = GlobalScalar<hb_mc_eva_t>("p");
-
   hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), p_dev);
   old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
   hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
@@ -76,14 +75,11 @@ int launch(int argc, char ** argv){
   hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
   //alpha_dev = GlobalScalar<float>("alpha");
   //epsilon_dev = GlobalScalar<float>("epsilon");
-
   std::cerr << "init locks" << std::endl;
   GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
   hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
   std::atomic<int> tmp_a[NUM_LOCKS] = {};
-
   Device::Ptr device = Device::GetInstance();
-
   float alpha = ((float) 0.15) ;
   float epsilon = ((float) 1e-06) ;
   int start_vertex = ROOT;
@@ -124,13 +120,6 @@ int launch(int argc, char ** argv){
     }
   }
 
-  //alpha_dev.set(alpha);
-  //epsilon_dev.set(epsilon);
-
-  //hammerblade::builtin_addVertexHB(frontier, start_vertex);
-  //hammerblade::insert_val(start_vertex, ((double) 1) , old_rank_dev);
-  //hammerblade::insert_val(start_vertex, ((double) 1) , new_rank_dev);
-
   std::cerr << "start of while loop\n";
   int tag_c = 0;
   //double host_rank[edges.num_nodes()];
@@ -219,6 +208,7 @@ int launch(int argc, char ** argv){
     }
     ver_file.close();  
   }
+  device->finish(); 
   return 0;
 }
 
diff --git a/examples/graphit/test_sssp_delta/Makefile b/examples/graphit/test_sssp_delta/Makefile
new file mode 100644
index 000000000..2e980c4e2
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/Makefile
@@ -0,0 +1,151 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new
+
+GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
+
+# TEST_NAME is the basename of the executable
+TEST_NAME = main
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = sssp 
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.cpp
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
+CDEFINES += 
+CXXDEFINES += 
+
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS) 
+
+HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+# Specify any header file dependencies
+main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
+main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS += 
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+BSG_MANYCORE_KERNELS = kernel.riscv
+
+kernel.rvo: RISCV_CXX = $(RISCV_GXX)
+kernel.riscv: kernel.rvo
+
+# Tile Group Dimensions
+TILE_GROUP_DIM_X = 16
+TILE_GROUP_DIM_Y = 8
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_sssp_delta/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
+
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For this test the C arguments are: <Path to RISC-V Binary> <Kernel Name> -g <Graph Path>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+# Regression Flow
+###############################################################################
+
+regression: main.exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+###############################################################################
+# Default rules, help, and clean
+###############################################################################
+.DEFAULT_GOAL := help
+help:
+	@echo "Usage:"
+	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
+	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
+	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
+	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
+	@echo "      clean: Remove all subdirectory-specific outputs"
+
+print-%  : ; @echo $* = $($*)
+
+.PHONY: clean
+
+clean:
+
+
diff --git a/examples/graphit/test_sssp_delta/kernel.cpp b/examples/graphit/test_sssp_delta/kernel.cpp
new file mode 100644
index 000000000..5039f1dd2
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/kernel.cpp
@@ -0,0 +1,99 @@
+#include <bsg_manycore.h>
+#include <bsg_set_tile_x_y.h>
+
+//#define BSG_TILE_GROUP_X_DIM 16 
+//#define BSG_TILE_GROUP_Y_DIM 2
+//#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
+//#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
+#include <bsg_tile_group_barrier.hpp>
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+#include <sssp.hpp>
+
+//#define DEBUG
+#ifdef DEBUG
+#define pr_dbg(fmt, ...)	\
+	bsg_printf(fmt, ##__VA_ARGS__)
+#else
+#define pr_dbg(fmt, ...)
+#endif
+
+
+__attribute__((section(".dram"))) int  * __restrict dist;
+
+template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(int *in_indices , WNode *in_neighbors, int* from_vertexset, int * next_frontier, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
+{ // Pull-direction edge apply: for each destination d in [start, end), calls apply_func(src, d, weight) on every in-neighbor whose from_vertexset entry is non-zero and sets next_frontier[d] = 1 when it returns true. Always returns 0; E and block_size_x are not used in this body.
+  bsg_cuda_print_stat_start(1); // start markers for the stat/SAIF region; the tag is hard-coded to 1
+  bsg_saif_start();
+  int start, end; // filled in by local_range; presumably this tile's share of [0, V) - confirm in local_range.h
+  local_range(V, &start, &end);
+  if(bsg_id == 0) pr_dbg("elem 1: %i and dist: %i and random weight: %i\n", from_vertexset[5], dist[5], in_neighbors[in_indices[5]].weight); // expands to nothing unless DEBUG is defined; NOTE(review): indexes element 5 unconditionally, so it assumes V > 5 when enabled
+  for ( int d = start; d < end; d++) {
+    int degree = in_indices[d + 1] - in_indices[d]; // in_indices[d] and in_indices[d + 1] bound d's slice of in_neighbors
+    WNode * neighbors = &in_neighbors[in_indices[d]];
+    for(int s = 0; s < degree; s++) { 
+      if(from_vertexset[neighbors[s].vertex]) { // only sources flagged in from_vertexset are applied
+        if( apply_func ( neighbors[s].vertex, d, neighbors[s].weight )) { 
+          next_frontier[d] = 1; // entries are only ever set here, never cleared
+        }
+      }
+    } //end of loop on in neighbors
+  } //end of outer for loop
+  bsg_saif_end();
+  bsg_cuda_print_stat_end(1);
+  barrier.sync(); // tile-group barrier declared at file scope; synchronizes before returning
+  return 0;
+} //end of edgeset apply function 
+
+
+struct dist_generated_vector_op_apply_func_0
+{ // Functor that initializes one entry of the global dist array.
+  void operator() (int v)
+  {
+    dist[v] = (2147483647) ; // 2147483647 is the largest 32-bit signed int; presumably the "not yet reached" distance - confirm against host code
+  };
+};
+struct updateEdge
+{ // Functor that relaxes one weighted edge src -> dst against the global dist array.
+  bool operator() (int src, int dst, int weight) // returns true only when dist[dst] was lowered
+  {
+    bool output3 = false;
+    int new_dist = (dist[src] + weight); // NOTE(review): overflows signed int if dist[src] still holds 2147483647 and weight > 0; the caller in this file only passes sources flagged in the frontier - confirm those always have a finite distance
+    if(dist[dst] > new_dist) {
+      dist[dst] = new_dist;
+      output3 = true;
+    }
+    return output3;
+  };
+};
+struct reset
+{ // Functor that writes 2147483647 into dist[v]; same body as dist_generated_vector_op_apply_func_0.
+  void operator() (int v)
+  {
+    dist[v] = (2147483647) ; // largest 32-bit signed int; presumably marks the vertex as unreached - confirm
+  };
+};
+
+extern "C" int  __attribute__ ((noinline)) dist_generated_vector_op_apply_func_0_kernel(int V) { // Kernel entry point: applies dist_generated_vector_op_apply_func_0 to every index in [start, end) and returns 0.
+	int start, end; // filled in by local_range(V, ...); presumably this tile's share of [0, V) - confirm
+	local_range(V, &start, &end);
+	for (int iter_x = start; iter_x < end; iter_x++) {
+		dist_generated_vector_op_apply_func_0()(iter_x); // a temporary functor is constructed per element; it holds no state
+	}
+	barrier.sync(); // synchronize on the file-scope tile-group barrier before returning
+	return 0;
+}
+extern "C" int  __attribute__ ((noinline)) reset_kernel(int V) { // Kernel entry point: applies the reset functor to every index in [start, end) and returns 0.
+	int start, end; // filled in by local_range(V, ...); presumably this tile's share of [0, V) - confirm
+	local_range(V, &start, &end);
+	for (int iter_x = start; iter_x < end; iter_x++) {
+		reset()(iter_x); // writes 2147483647 into dist[iter_x]
+	}
+	barrier.sync(); // synchronize on the file-scope tile-group barrier before returning
+	return 0;
+}
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call(int *in_indices, WNode *in_neighbors, int *frontier, int *modified_vertexsubset1, int V, int E, int block_size_x) { // extern "C" entry point that forwards to the template of the same name (minus _call) with updateEdge as the apply functor
+	edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(in_indices, in_neighbors, frontier, modified_vertexsubset1, updateEdge(), V, E, block_size_x); // frontier is the source set; modified_vertexsubset1 receives the next-frontier flags
+	return 0; // the template's return value is discarded; this wrapper always returns 0
+}
+
+
diff --git a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
new file mode 100644
index 000000000..ee50a54d6
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef __PR_PULL_BENCHMARK_HPP
+#define __PR_PULL_BENCHMARK_HPP
+
+#include <math.h>
+#include <local_range.h>
+#include <vertex_struct.h>
+#include <atomics.h>
+#endif
diff --git a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
new file mode 100644
index 000000000..23da05000
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
@@ -0,0 +1,8 @@
+#ifndef __SSSP_BENCHMARK_HPP
+#define __SSSP_BENCHMARK_HPP
+
+#include <vertex_struct.h>
+#include <local_range.h>
+#include <atomics.h>
+#include <wnode.h>
+#endif
diff --git a/examples/graphit/test_sssp_delta/main.cpp b/examples/graphit/test_sssp_delta/main.cpp
new file mode 100644
index 000000000..069ee5426
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/main.cpp
@@ -0,0 +1,219 @@
+#include "sssp.hpp"
+#define X 16 // first argument of hb_mc_dimension(X,Y) when the kernel job is enqueued
+#define Y 2 // second argument of hb_mc_dimension(X,Y) when the kernel job is enqueued
+#define NUM_LOCKS 1024 // number of entries in the device "locks" array
+#define VERIFY false // when true, launch() writes the next frontier to ./frontier_verify.txt
+#define ROOT 6 // vertex whose host-side distance host_sssp_pull sets to 0
+#define DELTA 32 // not referenced anywhere else in this file
+
+WGraphHB edges; // weighted graph; assigned in launch() from the -g input file
+GlobalScalar<hb_mc_eva_t> dist_dev; // handle for the device-side "dist" array
+//BucketPriorityQueue<int> pq;
+
+// Relaxes edge s->d of weight w; returns true iff dist[d] was lowered. The sum is
+// formed in 64 bits so an unreached source (dist == 2147483647) cannot overflow int.
+bool apply(int s, int d, int w, std::vector<int> &dist) {
+  const long long new_dist = static_cast<long long>(dist[s]) + w;
+  if (dist[d] <= new_dist) return false;
+  dist[d] = static_cast<int>(new_dist);
+  return true;
+}
+
+// Host reference for one dense pull step. For every vertex d, each in-edge whose source
+// is flagged in `front` is relaxed through apply(); next[d] is set to 1 whenever a
+// relaxation lowers dist[d]. Entries of `next` are only ever set here, never cleared.
+void sssp_pull_call(std::vector<int> &front, std::vector<int> &next, std::vector<int> &dist) {
+  auto g = edges.getHostGraph();
+  auto * in_neigh = g.in_neighbors_shared_.get();
+  auto ** in_index = g.in_index_shared_.get();
+  for(int d = 0; d < edges.num_nodes(); d++) {
+    int ind = in_index[d] - in_neigh;
+    auto * nbrs = &in_neigh[ind];
+    int deg = g.in_degree(d);
+    for(int k = 0; k < deg; k++){
+      auto & edge = nbrs[k];
+      if(!front[edge.v]) continue;
+      if(apply(edge.v, d, edge.w, dist)) next[d] = 1;
+    }
+  }
+}
+// Host-side replay of up to `iter` pull iterations: pops each frontier from pq, relaxes it on the host, then writes the distances back to dist_dev and updates pq.
+void host_sssp_pull(BucketPriorityQueue<int>& pq, std::vector<int> &dist, int iter) {
+  dist[ROOT] = 0; // seed the source vertex on the host
+  Device::Ptr device = Device::GetInstance(); 
+  Vector<int> next_frontier_dev = Vector<int>(edges.num_nodes());
+  std::vector<int> h_next(edges.num_nodes(), 0);
+  std::vector<int> h_front(edges.num_nodes(), 0);  
+
+  for(int i = 0; i < iter; i++) {
+    if(!(pq.finished() == 0)) { std::cout << "no more items on iter: " << i << "\n"; break; } // stop once pq.finished() is non-zero
+    Vector<int32_t> front = pq.popDenseReadyVertexSet(); // NOTE(review): never passed to hammerblade::deleteObject here, unlike the frontier in launch() -- confirm this does not leak
+    front.copyToHost(h_front.data(), edges.num_nodes()); 
+    device->freeze_cores();
+    device->read_dma();
+    device->unfreeze_cores();
+    int num_elems = std::count(h_front.begin(), h_front.end(), 1);
+    std::cout << "num elems in front: " << num_elems << " val of 0: " << h_front[0] << std::endl; 
+    sssp_pull_call(h_front, h_next, dist); // host relaxation; marks improved vertices in h_next
+    num_elems = std::count(h_next.begin(), h_next.end(), 1);
+    std::cout << "num elems in next front: " << num_elems << std::endl; 
+    std::cout << "dist of 1: " << dist[1] << std::endl;
+    next_frontier_dev.copyToDevice(h_next.data(), edges.num_nodes());  
+    hammerblade::write_global_buffer_dma<int>(dist.data(), dist_dev, edges.num_nodes());
+    device->freeze_cores();
+    device->write_dma();
+    device->unfreeze_cores();
+    hammerblade::updateBucketWithGraphItVertexSubset<int>(next_frontier_dev, pq);
+    std::fill(h_next.begin(), h_next.end(), 0); // reset the host flags for the next iteration
+  }
+}
+void host_sssp_push(BucketPriorityQueue<int> &pq, std::vector<int> &dist, int iter) { // host replay for the push version
+  host_sssp_pull(pq, dist, iter); // currently just forwards to the pull replay
+}
+
+// Host driver for the delta-stepping SSSP test. Loads the kernel binary named on the
+// command line and the weighted graph given with -g, sets up the device "dist" and
+// "locks" arrays and the bucket priority queue, replays `iter` iterations on the host,
+// and then runs a single pull-kernel iteration on the device.
+// Returns 0 on every path, including when -g is missing.
+int launch(int argc, char * argv[]){
+  InputParser input(argc, argv);
+  if(!input.cmdOptionExists("-g")) { 
+    // NOTE(review): this path still returns 0, which vcs_main/main compare against
+    // HB_MC_SUCCESS -- confirm that a missing graph argument is meant to pass.
+    std::cerr << "no input args\n";
+    for(auto i = 0; i < argc; i++) {
+      std::cerr << argv[i] << " ";
+    }
+    std::cerr << std::endl;
+    return 0;
+  }
+  std::string ucode_path = input.getRISCVFile();
+
+  // Number of iterations replayed on the host before the device kernel runs.
+  int iter = 0;
+  std::cerr << "iteration: " << iter << std::endl;
+
+  int version = 0; //pull-vertex
+  if(ucode_path.find("push-vertex") != std::string::npos) {
+    version = 1;
+  }
+  std::cerr << "load microcode" << std::endl;
+  hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
+  std::cerr << "load graph" << std::endl;
+
+  std::string graph_f = input.getCmdOption("-g");
+  edges = hammerblade::builtin_loadWeightedEdgesFromFileToHB (graph_f.c_str()); 
+  std::cerr << "out deg of 5: " << edges.out_degree(5) << " num edges: " << edges.num_edges() << std::endl;
+
+  Device::Ptr device = Device::GetInstance(); 
+  dist_dev = GlobalScalar<hb_mc_eva_t>("dist");
+  hammerblade::init_global_array<int>(edges.num_nodes(), dist_dev);
+  hammerblade::assign_val_dma<int>(0, edges.num_nodes(), (2147483647), dist_dev);
+  // Use the same source vertex that host_sssp_pull seeds (dist[ROOT] = 0), so the queue
+  // built from dist_dev and the host distances written back below agree on the source.
+  int start_vertex = ROOT;
+
+  std::cerr << "init locks\n";
+  GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
+  hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
+  std::atomic<int> tmp_array[NUM_LOCKS] = {};
+  hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_array, glbl_locks, NUM_LOCKS);
+
+  // NOTE(review): presumably the *_dma helpers above only queue transfers and write_dma() issues them -- confirm.
+  std::cerr << "doing batch dma write" << std::endl;
+  device->freeze_cores();
+  device->write_dma();
+  device->unfreeze_cores();
+  hammerblade::insert_val<int>(start_vertex, 0, dist_dev); 
+  std::cerr << "init pq" << std::endl;
+  BucketPriorityQueue<int> pq = BucketPriorityQueue<int>(edges.num_nodes(), &dist_dev, (hammerblade::BucketOrder)1, (hammerblade::PriorityOrder)0, (int) 128, (int) 32);
+
+  // Replay on the host, then overwrite the device distances with the host result.
+  std::cerr << "host side compute up to current iter: \n";
+  std::vector<int> h_dist(edges.num_nodes(), 2147483647);
+  if(version == 0) {
+    host_sssp_pull(pq, h_dist, iter);
+  } else {
+    host_sssp_push(pq, h_dist, iter);
+  } 
+  hammerblade::write_global_buffer_dma<int>(h_dist.data(), dist_dev, edges.num_nodes());
+  device->freeze_cores();
+  device->write_dma();
+  device->unfreeze_cores();
+
+  std::cerr << "starting while loop" << std::endl;
+  Vector<int32_t> next_frontier_dev;
+  switch(version){
+    case 0: { // do dense pull bfs
+      for(int i = 0; i < 1; i++) //just doing one large iteration
+      {
+	std::cerr << "doing SSSP Delta Stepping kernel" << std::endl;
+        Vector<int32_t> frontier = pq.popDenseReadyVertexSet(); 
+        std::cerr << "got frontier from pq\n";
+        next_frontier_dev = Vector<int32_t>(edges.num_nodes());
+        // device addresses of the current and the next frontier
+        printf("0x%08x\n", frontier.getAddr());
+        printf("next: 0x%08x\n", next_frontier_dev.getAddr());
+        std::cerr << "initialized next front\n";
+        // Kernel arguments: in-index and in-neighbor arrays, current and next frontier, V, E, and V again as block_size_x.
+        device->enqueueJob("edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call",
+                         hb_mc_dimension(X,Y),
+                        {edges.getInIndicesAddr(),
+                         edges.getInNeighborsAddr(),
+                         frontier.getAddr(),
+                         next_frontier_dev.getAddr(),  
+                         edges.num_nodes(),
+                         edges.num_edges(),
+                         edges.num_nodes()});
+        device->runJobs();
+        std::cerr << "updating buckets:\n";
+        hammerblade::updateBucketWithGraphItVertexSubset<int>(next_frontier_dev, pq);
+        hammerblade::deleteObject(frontier);
+    }
+    break;
+    }
+    case 1: { //do sparse push blocked bfs
+      // no kernel is launched for the push-vertex version
+    break;
+    } 
+  }
+
+  std::cerr << "finished while loop" << std::endl;
+
+  if(VERIFY) {
+    // std::vector owns the staging buffer, so it is released when this block exits.
+    std::vector<int> host_next(edges.num_nodes());
+    next_frontier_dev.copyToHost(host_next.data(), edges.num_nodes());
+
+    device->freeze_cores();
+    device->read_dma();
+    device->unfreeze_cores();
+
+    ofstream file("./frontier_verify.txt");
+    if(!file.is_open()) std::cerr <<"couldn't open file\n";
+    // One flag per line; set vertices whose id is a multiple of 50 are also echoed to stderr.
+    for(int i = 0; i < edges.num_nodes(); i++) {
+      if(host_next[i] == 1 && i % 50 == 0) std::cerr << i << std::endl;
+      file << host_next[i] << std::endl;
+    }
+    file.close();
+  }
+	device->finish();
+  return 0;
+}
+#ifdef VCS // with VCS defined the entry point is vcs_main
+int vcs_main(int argc, char ** argv) {
+    bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n");
+    const int status = launch(argc, argv);
+    bsg_pr_test_pass_fail(status == HB_MC_SUCCESS);
+    return status;
+}
+#else // otherwise a regular main() runs the same sequence
+int main(int argc, char ** argv) {
+    bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n");
+    const int status = launch(argc, argv);
+    bsg_pr_test_pass_fail(status == HB_MC_SUCCESS);
+    return status;
+}
+#endif // VCS
diff --git a/examples/graphit/test_sssp_delta/sssp.hpp b/examples/graphit/test_sssp_delta/sssp.hpp
new file mode 100644
index 000000000..2dfcd3d5c
--- /dev/null
+++ b/examples/graphit/test_sssp_delta/sssp.hpp
@@ -0,0 +1,26 @@
+#ifndef __SSSP_BENCHMARK_HPP
+#define __SSSP_BENCHMARK_HPP
+
+#pragma once
+#include "hb_intrinsics.h"
+#include "infra_hb/host/arg_parser.hpp"
+#include "infra_hb/host/priority_queue.hpp"
+#include <string.h>
+#include <stdio.h>
+#include <fstream> 
+#include <atomic>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+#include <cl_manycore_regression.h>
+
+using hammerblade::Device;
+using hammerblade::Vector;
+using hammerblade::GraphHB;
+using hammerblade::WGraphHB;
+using hammerblade::GlobalScalar;
+using hammerblade::BucketPriorityQueue;
+using hammerblade::Bucket;
+#endif

From 72e4809aee544688da49c36d1956be9339682e58 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Sun, 4 Apr 2021 18:28:36 -0700
Subject: [PATCH 14/22] hack to get multiple kernel versions support, needs to
 be refactored

---
 examples/cuda/riscv.mk                   |  2 +-
 examples/graphit/test_pr_nibble/Makefile | 48 ++++++++++++++++++++----
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/examples/cuda/riscv.mk b/examples/cuda/riscv.mk
index 87a52d511..00b37c1eb 100644
--- a/examples/cuda/riscv.mk
+++ b/examples/cuda/riscv.mk
@@ -244,7 +244,7 @@ RISCV_LDFLAGS += -Wl,--no-check-sections
 # This builds a .riscv binary for the current machine type and tile
 # group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked
 # in the final binary.
-%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
+kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
 	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
 
 kernel.link.clean:
diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index 4bfca9a2b..74d58eeb3 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -39,21 +39,24 @@
 # BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
 ###############################################################################
 
+CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
 
 include $(REPLICANT_PATH)/environment.mk
 SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
 CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
-CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new
+GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new
 
 GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
-
 # TEST_NAME is the basename of the executable
 TEST_NAME = main
 # KERNEL_NAME is the name of the CUDA-Lite Kernel
 KERNEL_NAME = pr_nibble
 
+VERSIONS = hybrid
+DEFAULT_VERSION := hybrid
+KERNEL_DEFAULT 	:= kernel/$(DEFAULT_VERSION)/kernel.cpp
+
 ###############################################################################
 # Host code compilation flags and flow
 ###############################################################################
@@ -69,7 +72,6 @@ FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
 CFLAGS   += -std=c99 $(FLAGS)
 CXXFLAGS += -std=c++11 $(FLAGS) 
 
-HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
 
 # compilation.mk defines rules for compilation of C/C++
 include $(EXAMPLES_PATH)/compilation.mk
@@ -93,11 +95,13 @@ include $(EXAMPLES_PATH)/link.mk
 
 # BSG_MANYCORE_KERNELS is a list of manycore executables that should
 # be built before executing.
-BSG_MANYCORE_KERNELS = kernel.riscv
+BSG_MANYCORE_KERNELS = kernel.riscv 
 
 kernel.rvo: RISCV_CXX = $(RISCV_GXX)
 kernel.riscv: kernel.rvo
 
+%/kernel.rvo: RISCV_CXX = $(RISCV_GXX)
+
 # Tile Group Dimensions
 TILE_GROUP_DIM_X = 16
 TILE_GROUP_DIM_Y = 8
@@ -108,6 +112,9 @@ RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/inc
 
 include $(EXAMPLES_PATH)/cuda/riscv.mk
 
+%/kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo %/kernel.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT)
+	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
+
 ###############################################################################
 # Execution flow
 #
@@ -116,13 +123,37 @@ include $(EXAMPLES_PATH)/cuda/riscv.mk
 #
 # SIM_ARGS: Use this to pass arguments to the simulator
 ###############################################################################
-C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+#C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
+C_ARGS ?= $(KERNEL_NAME) -g $(GRAPH_PATH)
 
 SIM_ARGS ?=
 
 # Include platform-specific execution rules
 include $(EXAMPLES_PATH)/execution.mk
 
+HOST_TARGET := $(TEST_NAME).profile
+
+$(VERSIONS): %: kernel/%/$(HOST_TARGET).log
+
+ALIASES = vanilla_stats.csv vcache_stats.csv
+$(ALIASES): $(HOST_TARGET).log ;
+$(HOST_TARGET).log: kernel.riscv $(HOST_TARGET)
+	./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@  
+
+
+KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a)
+.PRECIOUS: $(KERNEL_ALIASES)
+$(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ;
+kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET)
+	$(eval EXEC_PATH   := $(patsubst %/,%,$(dir $@)))
+	$(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH))
+	$(eval _VERSION    := $(notdir $(EXEC_PATH)))
+	cd $(EXEC_PATH) && \
+	$(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \
+		2>&1 | tee $(notdir $a)
+
+versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log)
+
 ###############################################################################
 # Regression Flow
 ###############################################################################
@@ -144,8 +175,11 @@ help:
 
 print-%  : ; @echo $* = $($*)
 
+version.clean:
+	rm -rf kernel/*/*{.ucli,.csv,.log,.rvo,.riscv,.vpd,.key,.dis,.ll,.ll.s}
+
 .PHONY: clean
 
-clean:
+clean: version.clean
 
 

From bcfc393515a0853f22b8a61921d40e40ba3b0324 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Tue, 20 Apr 2021 10:53:09 -0700
Subject: [PATCH 15/22] cleaning up/adding kernel code to repo

---
 examples/graphit/test_pr_nibble/Makefile      |  34 ++-
 .../test_pr_nibble/kernel/hybrid/kernel.cpp   | 229 ++++++++++++++++++
 examples/graphit/test_pr_nibble/main.cpp      |  32 +--
 examples/graphit/test_pr_nibble/pr_host.hpp   |  11 +-
 4 files changed, 272 insertions(+), 34 deletions(-)
 create mode 100644 examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp

diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index 74d58eeb3..9ca9ec067 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -52,8 +52,29 @@ GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
 TEST_NAME = main
 # KERNEL_NAME is the name of the CUDA-Lite Kernel
 KERNEL_NAME = pr_nibble
+HOST_TARGET := $(TEST_NAME).profile
+
+BASE_VERSIONS += hybrid-update
+
+ITERATIONS := 0 1 2 3 4 5 6 7 8 9
+v-from-basev-and-iter = $1-iteration-$2
+basev-from-v          = $(word 1,$(subst -iteration-, ,$1))
+iter-from-v           = $(word 2,$(subst -iteration-, ,$1))
+
+VERSIONS := $(foreach i,$(ITERATIONS),$(foreach v,$(BASE_VERSIONS),\
+        $(call v-from-basev-and-iter,$v,$i)))
+
+VERSION-DIRS := $(foreach v,$(VERSIONS),kernel/$v)
+
+.PHONY: $(VERSION-DIRS)
+$(VERSION-DIRS): 
+	cp -r $(call basev-from-v,$@) $@
+
+.PHONY: versions bleach-versions
+versions: $(VERSION-DIRS)
+bleach-versions: 
+	rm -rf $(VERSION-DIRS)
 
-VERSIONS = hybrid
 DEFAULT_VERSION := hybrid
 KERNEL_DEFAULT 	:= kernel/$(DEFAULT_VERSION)/kernel.cpp
 
@@ -131,20 +152,19 @@ SIM_ARGS ?=
 # Include platform-specific execution rules
 include $(EXAMPLES_PATH)/execution.mk
 
-HOST_TARGET := $(TEST_NAME).profile
 
 $(VERSIONS): %: kernel/%/$(HOST_TARGET).log
 
-ALIASES = vanilla_stats.csv vcache_stats.csv
+ALIASES = vanilla_stats.csv vcache_stats.csv dramsim3epoch.json dramsim3.json dramsim3.tag.json dramsim3.txt 
 $(ALIASES): $(HOST_TARGET).log ;
-$(HOST_TARGET).log: kernel.riscv $(HOST_TARGET)
+$(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv 
 	./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@  
 
 
 KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a)
 .PRECIOUS: $(KERNEL_ALIASES)
 $(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ;
-kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET)
+kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv 
 	$(eval EXEC_PATH   := $(patsubst %/,%,$(dir $@)))
 	$(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH))
 	$(eval _VERSION    := $(notdir $(EXEC_PATH)))
@@ -152,7 +172,9 @@ kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET)
 	$(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \
 		2>&1 | tee $(notdir $a)
 
-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log)
+.PRECIOUS: %.log
+
+all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log)
 
 ###############################################################################
 # Regression Flow
diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
new file mode 100644
index 000000000..16e66425c
--- /dev/null
+++ b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
@@ -0,0 +1,229 @@
+//#define DEBUG
+#include <bsg_manycore.h>
+
+#ifdef DEBUG
+#define BSG_TILE_GROUP_X_DIM 1 
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
+#else
+#include <bsg_set_tile_x_y.h>
+// #define BSG_TILE_GROUP_X_DIM 16 
+// #define BSG_TILE_GROUP_Y_DIM 8
+#endif
+
+#include <bsg_tile_group_barrier.hpp>
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+#include <pr_nibble.hpp>
+#include <cstring>
+
+#ifdef DEBUG
+#define pr_dbg(fmt, ...)			\
+		bsg_printf(fmt, ##__VA_ARGS__)
+#else
+#define pr_dbg(fmt, ...)
+#endif
+
+// Global pointers, placed in the .dram section, to the arrays that the functors in this file index by vertex id.
+// p, old_rank and new_rank point to float data; out_degree and generated_tmp_vector_3 point to int data.
+__attribute__((section(".dram"))) float  * __restrict p;
+__attribute__((section(".dram"))) float  * __restrict old_rank;
+__attribute__((section(".dram"))) float  * __restrict new_rank;
+__attribute__((section(".dram"))) int  * __restrict out_degree;
+__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
+
+// Pull traversal: for each destination in the slice that local_range assigns for V,
+// calls apply_func(src, dst) for every in-neighbor src that is flagged in from_vertexset.
+template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
+{
+  int first, last;
+  local_range(V, &first, &last);
+  for (int dst = first; dst < last; dst++) {
+    // in-edges of dst occupy in_neighbors[in_indices[dst] .. in_indices[dst + 1])
+    int * row = &in_neighbors[in_indices[dst]];
+    int row_len = in_indices[dst + 1] - in_indices[dst];
+    for (int k = 0; k < row_len; k++) {
+      int src = row[k];
+      if (!from_vertexset[src]) continue;
+      apply_func(src, dst);
+    }
+  }
+  return 0; // E and block_size_x are part of the interface but are not used here
+}
+
+// Push traversal: for each source in the slice that local_range assigns for V and that is
+// flagged in from_vertexset, calls apply_func(src, dst) for every out-neighbor dst.
+// NOTE(review): the same dst can be reached from sources in different slices and no
+// synchronization is visible here -- confirm apply_func tolerates concurrent updates.
+template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
+{
+  int first, last;
+  local_range(V, &first, &last);
+  for (int src = first; src < last; src++) {
+    if (!from_vertexset[src]) continue;
+    // out-edges of src occupy out_neighbors[out_indices[src] .. out_indices[src + 1])
+    int * row = &out_neighbors[out_indices[src]];
+    int row_len = out_indices[src + 1] - out_indices[src];
+    for (int k = 0; k < row_len; k++) {
+      apply_func(src, row[k]);
+    }
+  }
+  // E and block_size_x are part of the interface but are not used here
+  return 0;
+}
+
+
+// Vertex functor: copies generated_tmp_vector_3[v] into out_degree[v].
+struct generated_vector_op_apply_func_4
+{
+  void operator() (int v) {
+    out_degree[v] = generated_tmp_vector_3[v];
+  }
+};
+// Vertex functor: sets new_rank[v] to zero.
+struct new_rank_generated_vector_op_apply_func_2
+{
+  void operator() (int v) {
+    new_rank[v] = 0.0f;
+  }
+};
+// Vertex functor: sets old_rank[v] to zero.
+struct old_rank_generated_vector_op_apply_func_1
+{
+  void operator() (int v) {
+    old_rank[v] = 0.0f;
+  }
+};
+// Vertex functor: sets p[v] to zero.
+struct p_generated_vector_op_apply_func_0
+{
+  void operator() (int v) {
+    p[v] = 0.0f;
+  }
+};
+// Edge functor: adds ((1 - alpha) / (1 + alpha)) * old_rank[src] / out_degree[src] to new_rank[dst], with alpha = 0.15.
+struct updateEdge
+{
+  void operator() (int src, int dst) {
+    float alpha = 0.15; 
+    new_rank[dst] += ((((1)  - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src];
+  }
+};
+// Vertex functor: adds (2 * alpha / (1 + alpha)) * old_rank[v] to p[v], then zeroes new_rank[v]; alpha = 0.15.
+struct updateSelf
+{
+  void operator() (int v) {
+    float alpha = 0.15; 
+    p[v] += (((2)  * alpha) / ((1)  + alpha)) * old_rank[v];
+    new_rank[v] = 0;
+  }
+};
+// Vertex predicate for the next frontier: true iff new_rank[v] is non-zero and
+// greater than out_degree[v] * epsilon, with epsilon = 1e-6.
+// The test reads new_rank; an old_rank-based variant was previously kept here
+// as commented-out code.
+struct filter_frontier
+{
+  bool operator() (int v) {
+    float epsilon = (float) 1e-6; 
+    if (new_rank[v] == 0) return false;
+    float threshold = out_degree[v] * epsilon;
+    return new_rank[v] > threshold;
+  }
+};
+
+// Kernel entry: zeroes p[v] for each v in the slice that local_range assigns for V, then calls barrier.sync().
+extern "C" int  __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	p_generated_vector_op_apply_func_0 clear_p;
+	for (int v = first; v < last; ++v) clear_p(v);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: zeroes old_rank[v] for each v in the slice that local_range assigns for V, then calls barrier.sync().
+extern "C" int  __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	old_rank_generated_vector_op_apply_func_1 clear_old_rank;
+	for (int v = first; v < last; ++v) clear_old_rank(v);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: zeroes new_rank[v] for each v in the slice that local_range assigns for V, then calls barrier.sync().
+extern "C" int  __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	new_rank_generated_vector_op_apply_func_2 clear_new_rank;
+	for (int v = first; v < last; ++v) clear_new_rank(v);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: copies generated_tmp_vector_3[v] into out_degree[v] over the slice that local_range assigns for V, then calls barrier.sync().
+extern "C" int  __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) {
+	int first, last;
+	local_range(V, &first, &last);
+	generated_vector_op_apply_func_4 copy_degree;
+	for (int v = first; v < last; ++v) copy_degree(v);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: runs updateSelf on every v in the local_range slice with frontier[v] != 0, between bsg_cuda_print_stat_start/end(tag_c).
+extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) {
+  bsg_cuda_print_stat_start(tag_c);
+	barrier.sync();
+	int first, last;
+	local_range(V, &first, &last);
+	for (int v = first; v < last; ++v)
+		if (frontier[v]) { updateSelf()(v); }
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: runs the pull traversal with updateEdge as the per-edge functor, wrapped in bsg_cuda_print_stat_start/end(tag_c) and bsg_saif_start/end, with a barrier.sync() before and after. Always returns 0.
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
+	barrier.sync();
+  bsg_cuda_print_stat_start(tag_c);
+  bsg_saif_start();
+	edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_saif_end();
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+// Kernel entry: runs the push traversal with updateEdge as the per-edge functor, wrapped in bsg_cuda_print_stat_start/end(tag_c) and bsg_saif_start/end, with a barrier.sync() before and after. Always returns 0.
+ extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
+	barrier.sync(); 
+  bsg_cuda_print_stat_start(tag_c);
+  bsg_saif_start();
+	edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
+  bsg_saif_end();
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+
+// Kernel entry: rebuilds the frontier flags in next5. For each vertex v in the slice
+// that local_range assigns for V, next5[v] becomes 1 when filter_frontier accepts v
+// and 0 otherwise. The work is bracketed by bsg_cuda_print_stat_start/end(tag_c) and
+// barrier.sync() calls; block_size_x is accepted but not used. Always returns 0.
+extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { 
+  bsg_cuda_print_stat_start(tag_c);
+	barrier.sync();
+	int first, last;
+	local_range(V, &first, &last);
+	filter_frontier keep;
+	for (int v = first; v < last; ++v) {
+		// leave the loop as soon as the index reaches V
+		if (v >= V) break;
+		// clear first, then raise the flag only for vertices that pass the filter
+		next5[v] = 0;
+		if (keep(v)) next5[v] = 1;
+	}
+  bsg_cuda_print_stat_end(tag_c);
+	barrier.sync();
+	return 0;
+}
+
+
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index ff396f302..9cbfdde2b 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -34,11 +34,11 @@ int launch(int argc, char ** argv){
   std::string ucode_path = input.getRISCVFile();
 
   int iter = 0;
-  // std::string iterstrbase = "iteration-";
-  // auto pos = ucode_path.find(iterstrbase);
-  // auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos);
-  // std::stringstream ss(iterstr);
-  // ss >> iter;
+  std::string iterstrbase = "iteration-";
+  auto pos = ucode_path.find(iterstrbase);
+  auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos);
+  std::stringstream ss(iterstr);
+  ss >> iter;
   std::cerr << "iteration: " << iter << std::endl;
 
   int version = 0; //default to vertex pull
@@ -84,7 +84,6 @@ int launch(int argc, char ** argv){
   float epsilon = ((float) 1e-06) ;
   int start_vertex = ROOT;
   Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
-  //Vector<int32_t> next_frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
 
   std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
   std::vector<float> p(edges.num_nodes(), (float) 0.0);
@@ -122,12 +121,7 @@ int launch(int argc, char ** argv){
 
   std::cerr << "start of while loop\n";
   int tag_c = 0;
-  //double host_rank[edges.num_nodes()];
-  //ofstream prog_file;
-  //prog_file.open("./progress.txt");
-  //prog_file << "starting computation w/ root vertex: " << start_vertex << std::endl;
   //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) 
-  //while ( iter < 16) 
   for(int i = 0; i < 1; i++)
   {
     int f_sz = 0;
@@ -136,10 +130,10 @@ int launch(int argc, char ** argv){
       case 0: //vertex pull
 	    std::cerr << "pull kernel\n";
     	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
     	device->runJobs();
     	tag_c++;
-    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
+ 			std::cerr << "run update edges kernel on iter : " << iter << "\n";
       device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
     	device->runJobs();
       tag_c++;
@@ -154,25 +148,25 @@ int launch(int argc, char ** argv){
       case 1: //vertex push
 	    std::cerr << "push kernel\n";
     	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
     	device->runJobs();
     	tag_c++;
     	std::cerr << "run update edges kernel on iter : " << iter << "\n";
       device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); 
     	device->runJobs();
       tag_c++;
-    	std::cerr << "swap arrays\n";
-   	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
     	std::cerr << "create next frontier\n";
     	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
     	device->runJobs();
+    	std::cerr << "swap arrays\n";
+   	  hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
       f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
       std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
       break;
       case 2: //blocked pull
 	    std::cerr << "blocked pull kernel\n";
     	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c});
+    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
     	device->runJobs();
     	tag_c++;
     	std::cerr << "run update edges kernel on iter : " << iter << "\n";
@@ -191,13 +185,9 @@ int launch(int argc, char ** argv){
     tag_c++;
 
     iter++;
-    //prog_file << "finished iteration: " << iter << std::endl; 
   }
   std::cerr << "*******end of program********\n";
-  //prog_file << "*******end of program********\n";
   std::cerr << "took: " << iter << " iterations to complete\n";
-  //prog_file << "took: " << iter << " iterations to complete\n";
-  //prog_file.close();
   if(VERIFY) {
     ofstream ver_file;
     ver_file.open("./rank.txt");
diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp
index 7e7479495..1923c6d6d 100644
--- a/examples/graphit/test_pr_nibble/pr_host.hpp
+++ b/examples/graphit/test_pr_nibble/pr_host.hpp
@@ -13,16 +13,16 @@ inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank,
     ofstream ofile;
     ofile.open (fname);
     for(int i = 0; i < iter; i++) {
-        //std::memcpy(new_rank, old_rank, sizeof(float)*edges.num_nodes());
-	    //new_rank = old_rank;
         new_rank.assign(old_rank.begin(), old_rank.end());
         //print out iteration and size:
         int num_items = std::count(frontier.begin(), frontier.end(), 1);
         std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl;
         //update_self
         for(int v = 0; v < g.num_nodes(); v++) {
-            p[v] += (2.0 * alpha) / (1.0  + alpha) * old_rank[v];
-            new_rank[v] = (float) 0.0 ;
+						if(frontier[v]) {
+            	p[v] += (2.0 * alpha) / (1.0  + alpha) * old_rank[v];
+            	new_rank[v] = (float) 0.0 ;
+						}
         }
         //update edges
         for(int d = 0; d < g.num_nodes(); d++) {
@@ -35,9 +35,6 @@ inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank,
                 }
             }
         }
-        //old_rank.swap(new_rank);
-        //std::memcpy(old_rank, new_rank, sizeof(float)*edges.num_nodes());
-        //old_rank = new_rank;
         old_rank.assign(new_rank.begin(), new_rank.end());
         //update frontier
         for(int v = 0; v < g.num_nodes(); v++) {

From 5a72371e4b338b6db74842daa495f7506bdde6a8 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Wed, 5 May 2021 10:57:33 -0700
Subject: [PATCH 16/22] Redirects submodules

---
 .gitmodules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 099c758cb..c130faae3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,7 @@
 	url = git@github.com:bespoke-silicon-group/hb-prog-eval
 [submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"]
 	path = examples/sdh-eval-workloads/ipnsw/graph-tools
-	url = git@github.com:mrutt92/graph-tools
+	url = git@github.com:bespoke-silicon-group/graph-tools
 [submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"]
 	path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
-	url = git@github.com:mrutt92/hammerblade-helpers
+	url = git@github.com:bespoke-silicon-group/hammerblade-helpers

From b12e42e7842cb0168b789e9badaaae20fbe70e54 Mon Sep 17 00:00:00 2001
From: Max Ruttenberg <mrutt@cs.washington.edu>
Date: Wed, 5 May 2021 12:20:10 -0700
Subject: [PATCH 17/22] [ipnsw] adds rule to generate input

---
 examples/sdh-eval-workloads/ipnsw/Makefile | 30 +++++++++++++++++-----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile
index d643c9e9f..a8f6da2c5 100644
--- a/examples/sdh-eval-workloads/ipnsw/Makefile
+++ b/examples/sdh-eval-workloads/ipnsw/Makefile
@@ -6,6 +6,23 @@ include $(REPLICANT_PATH)/environment.mk
 
 all:
 
+##################
+# Prepare inputs #
+##################
+ipnsw-eval-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw
+ipnsw-inputs  = $(ipnsw-eval-dir)/data/database_music100.bin
+ipnsw-inputs += $(ipnsw-eval-dir)/data/query_music100.bin
+ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_0
+ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_1
+ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_2
+ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_3
+
+ipnsw-input := $(ipnsw-eval-dir)/data/database_music100.bin
+# prep.sh generates every file listed in ipnsw-inputs, but the rule names only
+# one of them as its target so that make does not run the script once per file
+$(ipnsw-input):
+	cd $(ipnsw-eval-dir) && bash prep.sh
+
 #######################################
 # Base clase run directory generation #
 #######################################
@@ -50,12 +67,7 @@ endef
 #################################
 # Common command line arguments #
 #################################
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/database_music100.bin
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/query_music100.bin
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_0
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_1
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_2
-C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_3
+C_ARGS += $(ipnsw-inputs)
 
 ###############
 # Greedy Walk #
@@ -141,6 +153,12 @@ exec:     exec-$(call run-name,$1,$2)
 profile:  profile-$(call run-name,$1,$2)
 debug:    debug-$(call run-name,$1,$2)
 saifgen:  saifgen-$(call run-name,$1,$2)
+
+saifgen-$(call run-name,$1,$2): $(ipnsw-input)
+profile-$(call run-name,$1,$2): $(ipnsw-input)
+debug-$(call run-name,$1,$2):   $(ipnsw-input)
+exec-$(call run-name,$1,$2):    $(ipnsw-input)
+
 endef
 .PHONY: generate
 .PHONY: purge

From 769350dc4d4e49904aed4b570ec8d865e4d525eb Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Wed, 5 May 2021 16:24:36 -0700
Subject: [PATCH 18/22] [pr-nibble] adding graphit submodule, removing
 unnecessary tests, trying to refactor test

---
 .gitmodules                                   |   3 +
 examples/graphit/Makefile                     |   1 -
 examples/graphit/graphit-src                  |   1 +
 examples/graphit/riscv.mk                     | 257 ------------------
 examples/graphit/test_pr_nibble/Makefile      |   9 +-
 examples/graphit/test_pr_nibble/main.cpp      |  18 +-
 examples/graphit/test_pr_nibble/pr.hpp        |   2 +-
 examples/graphit/test_sssp_delta/Makefile     | 151 ----------
 examples/graphit/test_sssp_delta/kernel.cpp   |  99 -------
 .../kernel/include/pr_nibble.hpp              |   9 -
 .../test_sssp_delta/kernel/include/sssp.hpp   |   8 -
 examples/graphit/test_sssp_delta/main.cpp     | 219 ---------------
 examples/graphit/test_sssp_delta/sssp.hpp     |  26 --
 .../graphit/test_vec_add_parallel/Makefile    | 142 ----------
 .../graphit/test_vec_add_parallel/kernel.cpp  |  20 --
 examples/graphit/test_vec_add_parallel/main.c | 196 -------------
 16 files changed, 11 insertions(+), 1150 deletions(-)
 create mode 160000 examples/graphit/graphit-src
 delete mode 100644 examples/graphit/riscv.mk
 delete mode 100644 examples/graphit/test_sssp_delta/Makefile
 delete mode 100644 examples/graphit/test_sssp_delta/kernel.cpp
 delete mode 100644 examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
 delete mode 100644 examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
 delete mode 100644 examples/graphit/test_sssp_delta/main.cpp
 delete mode 100644 examples/graphit/test_sssp_delta/sssp.hpp
 delete mode 100644 examples/graphit/test_vec_add_parallel/Makefile
 delete mode 100644 examples/graphit/test_vec_add_parallel/kernel.cpp
 delete mode 100644 examples/graphit/test_vec_add_parallel/main.c

diff --git a/.gitmodules b/.gitmodules
index c130faae3..18ee1bd0f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"]
 	path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers
 	url = git@github.com:bespoke-silicon-group/hammerblade-helpers
+[submodule "examples/graphit/graphit-src"]
+	path = examples/graphit/graphit-src
+	url = https://github.com/bespoke-silicon-group/graphit.git
diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile
index f8389272b..600ef53f4 100644
--- a/examples/graphit/Makefile
+++ b/examples/graphit/Makefile
@@ -45,7 +45,6 @@ include $(REPLICANT_PATH)/environment.mk
 include $(EXAMPLES_PATH)/link.mk
 
 # Define the tests that get run
-TESTS += test_vec_add_parallel
 TESTS += test_pr_nibble
 
 regression: $(TESTS)
diff --git a/examples/graphit/graphit-src b/examples/graphit/graphit-src
new file mode 160000
index 000000000..9f4d8e9ba
--- /dev/null
+++ b/examples/graphit/graphit-src
@@ -0,0 +1 @@
+Subproject commit 9f4d8e9bacac0ed44afe7c3abde697f21457a487
diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk
deleted file mode 100644
index 87a52d511..000000000
--- a/examples/graphit/riscv.mk
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright (c) 2019, University of Washington All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
-#
-# Redistributions of source code must retain the above copyright notice, this list
-# of conditions and the following disclaimer.
-#
-# Redistributions in binary form must reproduce the above copyright notice, this
-# list of conditions and the following disclaimer in the documentation and/or
-# other materials provided with the distribution.
-#
-# Neither the name of the copyright holder nor the names of its contributors may
-# be used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# TODO: Makefile comment
-ORANGE=\033[0;33m
-RED=\033[0;31m
-NC=\033[0m
-
-################################################################################
-# Paths
-################################################################################
-_REPO_ROOT ?= $(shell git rev-parse --show-toplevel)
--include $(_REPO_ROOT)/environment.mk
-
-BSG_MANYCORE_SPMD_PATH = $(BSG_MANYCORE_DIR)/software/spmd/
-BSG_MANYCORE_CUDALITE_PATH = $(BSG_MANYCORE_SPMD_PATH)/bsg_cuda_lite_runtime/
-BSG_MANYCORE_CUDALITE_MAIN_PATH = $(BSG_MANYCORE_CUDALITE_PATH)/main
-
-BSG_MANYCORE_LIB_PATH    = $(BSG_MANYCORE_DIR)/software/bsg_manycore_lib
-BSG_MANYCORE_COMMON_PATH = $(BSG_MANYCORE_SPMD_PATH)/common/
-
-RISCV_TOOLS_PATH := $(BSG_MANYCORE_DIR)/software/riscv-tools/
-RISCV_GNU_PATH   := $(RISCV_TOOLS_PATH)/riscv-install
-RISCV_LLVM_PATH  := $(RISCV_TOOLS_PATH)/llvm/llvm-install
-
-################################################################################
-# Include RISC-V Tool Configuration
-################################################################################
-
-RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py
-
-# These flags are not supported by clang
-RISCV_GNU_FLAGS = -frerun-cse-after-loop -fweb -frename-registers -mtune=bsg_vanilla_2020
-
-RISCV_GCC        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS)
-RISCV_GXX        ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS)
-RISCV_ELF2HEX    ?= LD_LIBRARY_PATH=$(RISCV_GNU_PATH)/lib $(RISCV_GNU_PATH)/bin/elf2hex
-RISCV_OBJCOPY    ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objcopy
-RISCV_AR         ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-ar
-RISCV_OBJDUMP    ?= $(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs-objdump
-RISCV_LINK       ?= $(RISCV_GCC) -t -T $(LINK_SCRIPT) $(RISCV_LDFLAGS)
-RISCV_LD         ?= $(RISCV_GCC)
-
-RISCV_CLANG_ABI        = ilp32f
-RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) -march=riscv32imaf -mtune=hb-rv32
-RISCV_CLANG_CXXFLAGS  += --sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs
-RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0
-RISCV_CLANG_CXXFLAGS  += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs  
-
-RISCV_CLANG       ?= $(RISCV_LLVM_PATH)/bin/clang $(RISCV_CLANG_CFLAGS) $(RISCV_CLANG_CCPPFLAGS)
-RISCV_CLANGXX     ?= $(RISCV_LLVM_PATH)/bin/clang++ $(RISCV_CLANG_CXXFLAGS) $(RISCV_CLANG_CCPPFLAGS)
-RISCV_LLVM_OPT    ?= $(RISCV_LLVM_PATH)/bin/opt
-RISCV_LLVM_LLC    ?= $(RISCV_LLVM_PATH)/bin/llc
-RISCV_LLVM_LIB    ?= $(RISCV_LLVM_PATH)/lib
-
-# Set the default RISC-V Compilers. To override these globally set
-# RISCV_CXX = $(RISCV_CLANGXX), etc. This can also be done on a
-# per-object basis. For example, foo.rvo: RISCV_CXX=$(RISCV_CLANGXX)
-RISCV_CXX ?= $(RISCV_GXX)
-RISCV_CC  ?= $(RISCV_GCC)
-
-################################################################################
-# C/C++ Compilation Flags
-#
-# All RISCV C/C++ compilation variables simply have RISCV_* appended.
-################################################################################
-RISCV_OPT_LEVEL   ?= -O2
-RISCV_ARCH_OP     := rv32imaf
-
-# CCPPFLAGS are common between GCC and G++
-RISCV_CCPPFLAGS += $(RISCV_OPT_LEVEL)
-RISCV_CCPPFLAGS += -march=$(RISCV_ARCH_OP)
-RISCV_CCPPFLAGS += -g
-RISCV_CCPPFLAGS += -static
-RISCV_CCPPFLAGS += -ffast-math
-RISCV_CCPPFLAGS += -fno-common
-RISCV_CCPPFLAGS += -ffp-contract=off
-
-RISCV_CFLAGS   += -std=gnu99 $(RISCV_CCPPFLAGS)
-RISCV_CXXFLAGS += -std=c++11 $(RISCV_CCPPFLAGS)
-RISCV_CXXFLAGS += -fno-threadsafe-statics
-
-RISCV_INCLUDES += -I$(BSG_MANYCORE_COMMON_PATH)
-RISCV_INCLUDES += -I$(BSG_MANYCORE_DIR)/software/bsg_manycore_lib
-
-# TODO: Fail if bsg_tiles_X/Y are not set
-RISCV_DEFINES += -Dbsg_global_X=$(BSG_MACHINE_GLOBAL_X)
-RISCV_DEFINES += -Dbsg_global_Y=$(BSG_MACHINE_GLOBAL_Y)
-RISCV_DEFINES += -Dbsg_group_size=$(BSG_MACHINE_POD_TILES)
-RISCV_DEFINES += -Dbsg_pods_X=$(BSG_MACHINE_PODS_X)
-RISCV_DEFINES += -Dbsg_pods_Y=$(BSG_MACHINE_PODS_Y)
-RISCV_DEFINES += -DIO_X_INDEX=$(BSG_MACHINE_HOST_X_CORD)
-RISCV_DEFINES += -DIO_Y_INDEX=$(BSG_MACHINE_HOST_Y_CORD)
-RISCV_DEFINES += -DPREALLOCATE=0
-RISCV_DEFINES += -DHOST_DEBUG=0
-
-# We build and name a machine-specific crt.rvo because it's REALLY
-# difficult to figure out why your program/cosimulation is hanging
-# when the wrong link script was used during linking
-crt.rvo: $(BSG_MANYCORE_COMMON_PATH)/crt.S
-	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.comp.log
-
-# We compile these locally so that we don't interfere with the files in
-# $(BSG_MANYCORE_LIB_PATH).
-# BSG Manycore Library Objects
-LIBBSG_MANYCORE_OBJECTS  += bsg_set_tile_x_y.rvo
-LIBBSG_MANYCORE_OBJECTS  += bsg_tile_config_vars.rvo
-LIBBSG_MANYCORE_OBJECTS  += bsg_printf.rvo
-
-$(LIBBSG_MANYCORE_OBJECTS) main.rvo: RISCV_CXX = $(RISCV_GCC)
-
-$(LIBBSG_MANYCORE_OBJECTS): %.rvo:$(BSG_MANYCORE_LIB_PATH)/%.c
-	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@
-
-main.rvo: $(BSG_MANYCORE_CUDALITE_MAIN_PATH)/main.c
-	$(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@
-
-%.rvo: %.c
-	$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log
-
-%.rvo: %.cpp
-	$(RISCV_CXX) $(RISCV_CXXFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log
-
-kernel.compile.clean:
-	rm -rf *.rvo *.a
-
-.PRECIOUS: %.rvo
-
-################################################################################
-# Linker Flow
-################################################################################
-
-# ELF File Parameters
-# Default .data section location; LOCAL=>DMEM, SHARED=>DRAM.
-BSG_ELF_DEFAULT_DATA_LOC ?= LOCAL
-
-BSG_ELF_OFF_CHIP_MEM := $(BSG_MACHINE_DRAM_INCLUDED)
-
-# Total addressable DRAM size (in 32-bit WORDS, and SIZE bytes)
-BSG_ELF_DRAM_WORDS := $(shell expr $(BSG_MACHINE_DRAM_BANK_SIZE_WORDS) \* $(BSG_MACHINE_GLOBAL_X))
-BSG_ELF_DRAM_SIZE := $(shell expr $(BSG_ELF_DRAM_WORDS) \* 4)
-
-# Victim Cache Set Size (in 32-bit WORDS and SIZE bytes)
-_BSG_ELF_VCACHE_SET_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_WAY) \* $(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS))
-BSG_ELF_VCACHE_SET_SIZE := $(shell expr $(_BSG_ELF_VCACHE_SET_WORDS) \* 4)
-
-# Victim Cache Column Size (in 32-bit WORDS and SIZE bytes)
-_BSG_ELF_VCACHE_COLUMN_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_SET) \* $(_BSG_ELF_VCACHE_SET_WORDS))
-BSG_ELF_VCACHE_COLUMN_SIZE := $(shell expr $(_BSG_ELF_VCACHE_COLUMN_WORDS) \* 4)
-
-# Victim Cache Total Size (in 32-bit WORDS, and SIZE BYTES)
-_BSG_ELF_VCACHE_MANYCORE_WORDS ?= $(shell expr $(BSG_MACHINE_GLOBAL_X) \* $(_BSG_ELF_VCACHE_COLUMN_WORDS))
-BSG_ELF_VCACHE_MANYCORE_SIZE := $(shell expr $(_BSG_ELF_VCACHE_MANYCORE_WORDS) \* 4)
-
-# Compute the ELF Stack Pointer Location.  
-ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
-# If the .data segment is in DMEM (LOCAL) then put it at the top of DMEM. (This is the typical case)
-BSG_ELF_STACK_PTR ?= 0x00000ffc
-else
-  # EVA Offset in DRAM
-  BSG_ELF_DRAM_EVA_OFFSET = 0x80000000
-
-  ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1)
-  # Otherwise, use the top of DRAM (if present),
-  _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_DRAM_SIZE))
-  else
-  # Or the Victim Cache address space (if DRAM is disabled/not present).
-  _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_VCACHE_MANYCORE_SIZE))
-  endif
-# Finally, Subtract 4 from the maximum memory space address
-BSG_ELF_STACK_PTR = $(shell expr $(_BSG_ELF_DRAM_LIMIT) - 4)
-endif
-
-# Linker script generation parameters
-ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1)
-  ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
-    LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR)
-  else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED)
-    LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR)
-  else
-    $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid)
-  endif
-
-  LINK_GEN_OPTS += --imem_size=0x01000000 # 16MB
-else ifeq ($(BSG_ELF_OFF_CHIP_MEM), 0)
-  ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL)
-    LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_VCACHE_SIZE) --sp=$(BSG_ELF_STACK_PTR)
-  else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED)
-    LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_VCACHE_SIZE) --sp=$(BSG_ELF_STACK_PTR)
-  else
-    $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid)
-  endif
-
-  LINK_GEN_OPTS += --imem_size=0x00008000 # 32KB
-else
-  $(error Invalid BSG_ELF_OFF_CHIP_MEM = $(BSG_ELF_OFF_CHIP_MEM); Only 0 and 1 are valid)
-endif
-
-RISCV_LINK_SCRIPT ?= bsg_link.ld
-$(RISCV_LINK_SCRIPT): $(RISCV_LINK_GEN)
-	$(RISCV_LINK_GEN) $(LINK_GEN_OPTS) --out=$@
-
-# Link commands and definitions
-
-RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_dram_size=$(BSG_ELF_DRAM_SIZE)
-RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_vcache_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE)
-RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_stack_ptr=$(BSG_ELF_STACK_PTR)
-
-RISCV_LDFLAGS += -nostdlib
-RISCV_LDFLAGS += -march=$(RISCV_ARCH_OP)
-RISCV_LDFLAGS += -nostartfiles
-RISCV_LDFLAGS += -ffast-math
-RISCV_LDFLAGS += -lc
-RISCV_LDFLAGS += -lm
-RISCV_LDFLAGS += -lgcc
-
-# TODO: temporary fix to solve this problem: https://stackoverflow.com/questions/56518056/risc-v-linker-throwing-sections-lma-overlap-error-despite-lmas-belonging-to-dif
-RISCV_LDFLAGS += -Wl,--no-check-sections 
-
-# This builds a .riscv binary for the current machine type and tile
-# group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked
-# in the final binary.
-%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) 
-	$(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@
-
-kernel.link.clean:
-	rm -rf *.riscv $(RISCV_LINK_SCRIPT)
-
-
-.PRECIOUS: %.riscv
-.PHONY: kernel.link.clean kernel.compile.clean
-clean: kernel.link.clean kernel.compile.clean
-
diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index 9ca9ec067..bdd77f513 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -45,7 +45,7 @@ REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
 include $(REPLICANT_PATH)/environment.mk
 SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
 CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
-GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new
+GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-src
 
 GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
 # TEST_NAME is the basename of the executable
@@ -54,7 +54,7 @@ TEST_NAME = main
 KERNEL_NAME = pr_nibble
 HOST_TARGET := $(TEST_NAME).profile
 
-BASE_VERSIONS += hybrid-update
+BASE_VERSIONS += hybrid
 
 ITERATIONS := 0 1 2 3 4 5 6 7 8 9
 v-from-basev-and-iter = $1-iteration-$2
@@ -98,8 +98,7 @@ CXXFLAGS += -std=c++11 $(FLAGS)
 include $(EXAMPLES_PATH)/compilation.mk
 
 # Specify any header file dependencies
-main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
-main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
+main.o: INCLUDES += -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
 
 ###############################################################################
 # Host code link flags and flow
@@ -162,7 +161,7 @@ $(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv
 
 
 KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a)
-.PRECIOUS: $(KERNEL_ALIASES)
+.PRECIOUS: $(KERNEL_ALIASES) kernel/%/kernel.riscv
 $(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ;
 kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv 
 	$(eval EXEC_PATH   := $(patsubst %/,%,$(dir $@)))
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index 9cbfdde2b..f0bee8b64 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -25,7 +25,7 @@ GlobalScalar<hb_mc_eva_t> out_degree_dev;
 
 #include "pr_host.hpp"
 
-int launch(int argc, char ** argv){
+int test_pr_nibble(int argc, char ** argv){
   InputParser input(argc, argv);
   if(!input.cmdOptionExists("-g")){
     std::cerr << "no input args\n";
@@ -202,18 +202,4 @@ int launch(int argc, char ** argv){
   return 0;
 }
 
-#ifdef VCS
-int vcs_main(int argc, char ** argv){
-    bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n");
-    int rc = launch(argc, argv); 
-    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
-    return rc;
-}
-#else
-int main(int argc, char ** argv) {
-    bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n");
-    int rc = launch(argc, argv);
-    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
-    return rc;
-}
-#endif 
+declare_program_main("test_pr_nibble", test_pr_nibble); 
diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp
index b1f9ac484..5cce0e30a 100644
--- a/examples/graphit/test_pr_nibble/pr.hpp
+++ b/examples/graphit/test_pr_nibble/pr.hpp
@@ -4,6 +4,7 @@
 
 #include "hb_intrinsics.h"
 #include "infra_hb/host/arg_parser.hpp"
+#include <bsg_manycore_regression.h>
 #include <string.h>
 #include <stdio.h>
 #include <iostream>
@@ -14,7 +15,6 @@
 #include <bsg_manycore_tile.h>
 #include <bsg_manycore_loader.h>
 #include <bsg_manycore_cuda.h>
-#include <cl_manycore_regression.h>
 
 
 using hammerblade::Device;
diff --git a/examples/graphit/test_sssp_delta/Makefile b/examples/graphit/test_sssp_delta/Makefile
deleted file mode 100644
index 2e980c4e2..000000000
--- a/examples/graphit/test_sssp_delta/Makefile
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2021, University of Washington All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
-#
-# Redistributions of source code must retain the above copyright notice, this list
-# of conditions and the following disclaimer.
-#
-# Redistributions in binary form must reproduce the above copyright notice, this
-# list of conditions and the following disclaimer in the documentation and/or
-# other materials provided with the distribution.
-#
-# Neither the name of the copyright holder nor the names of its contributors may
-# be used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# This Makefile compiles, links, and executes examples Run `make help`
-# to see the available targets for the selected platform.
-
-################################################################################
-# environment.mk verifies the build environment and sets the following
-# makefile variables:
-#
-# LIBRAIRES_PATH: The path to the libraries directory
-# HARDWARE_PATH: The path to the hardware directory
-# EXAMPLES_PATH: The path to the examples directory
-# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
-# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
-###############################################################################
-
-REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
-
-include $(REPLICANT_PATH)/environment.mk
-SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
-CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
-CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new
-
-GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
-
-# TEST_NAME is the basename of the executable
-TEST_NAME = main
-# KERNEL_NAME is the name of the CUDA-Lite Kernel
-KERNEL_NAME = sssp 
-
-###############################################################################
-# Host code compilation flags and flow
-###############################################################################
-
-# TEST_SOURCES is a list of source files that need to be compiled
-TEST_SOURCES = main.cpp
-
-DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
-CDEFINES += 
-CXXDEFINES += 
-
-FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
-CFLAGS   += -std=c99 $(FLAGS)
-CXXFLAGS += -std=c++11 $(FLAGS) 
-
-HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ 
-
-# compilation.mk defines rules for compilation of C/C++
-include $(EXAMPLES_PATH)/compilation.mk
-
-# Specify any header file dependencies
-main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/
-main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
-
-###############################################################################
-# Host code link flags and flow
-###############################################################################
-
-LDFLAGS += 
-
-# link.mk defines rules for linking of the final execution binary.
-include $(EXAMPLES_PATH)/link.mk
-
-###############################################################################
-# Device code compilation flow
-###############################################################################
-
-# BSG_MANYCORE_KERNELS is a list of manycore executables that should
-# be built before executing.
-BSG_MANYCORE_KERNELS = kernel.riscv
-
-kernel.rvo: RISCV_CXX = $(RISCV_GXX)
-kernel.riscv: kernel.rvo
-
-# Tile Group Dimensions
-TILE_GROUP_DIM_X = 16
-TILE_GROUP_DIM_Y = 8
-RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
-RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
-
-RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_sssp_delta/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
-
-include $(EXAMPLES_PATH)/cuda/riscv.mk
-
-###############################################################################
-# Execution flow
-#
-# C_ARGS: Use this to pass arguments that you want to appear in argv
-#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
-#
-# SIM_ARGS: Use this to pass arguments to the simulator
-###############################################################################
-C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH)
-
-SIM_ARGS ?=
-
-# Include platform-specific execution rules
-include $(EXAMPLES_PATH)/execution.mk
-
-###############################################################################
-# Regression Flow
-###############################################################################
-
-regression: main.exec.log
-	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
-
-###############################################################################
-# Default rules, help, and clean
-###############################################################################
-.DEFAULT_GOAL := help
-help:
-	@echo "Usage:"
-	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
-	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
-	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
-	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
-	@echo "      clean: Remove all subdirectory-specific outputs"
-
-print-%  : ; @echo $* = $($*)
-
-.PHONY: clean
-
-clean:
-
-
diff --git a/examples/graphit/test_sssp_delta/kernel.cpp b/examples/graphit/test_sssp_delta/kernel.cpp
deleted file mode 100644
index 5039f1dd2..000000000
--- a/examples/graphit/test_sssp_delta/kernel.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-#include <bsg_manycore.h>
-#include <bsg_set_tile_x_y.h>
-
-//#define BSG_TILE_GROUP_X_DIM 16 
-//#define BSG_TILE_GROUP_Y_DIM 2
-//#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
-//#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
-#include <bsg_tile_group_barrier.hpp>
-bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
-#include <sssp.hpp>
-
-//#define DEBUG
-#ifdef DEBUG
-#define pr_dbg(fmt, ...)	\
-	bsg_printf(fmt, ##__VA_ARGS__)
-#else
-#define pr_dbg(fmt, ...)
-#endif
-
-
-__attribute__((section(".dram"))) int  * __restrict dist;
-
-template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(int *in_indices , WNode *in_neighbors, int* from_vertexset, int * next_frontier, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
-{ 
-  bsg_cuda_print_stat_start(1);
-  bsg_saif_start();
-  int start, end;
-  local_range(V, &start, &end);
-  if(bsg_id == 0) pr_dbg("elem 1: %i and dist: %i and random weight: %i\n", from_vertexset[5], dist[5], in_neighbors[in_indices[5]].weight);
-  for ( int d = start; d < end; d++) {
-    int degree = in_indices[d + 1] - in_indices[d];
-    WNode * neighbors = &in_neighbors[in_indices[d]];
-    for(int s = 0; s < degree; s++) { 
-      if(from_vertexset[neighbors[s].vertex]) {
-        if( apply_func ( neighbors[s].vertex, d, neighbors[s].weight )) { 
-          next_frontier[d] = 1;
-        }
-      }
-    } //end of loop on in neighbors
-  } //end of outer for loop
-  bsg_saif_end();
-  bsg_cuda_print_stat_end(1);
-  barrier.sync();
-  return 0;
-} //end of edgeset apply function 
-
-
-struct dist_generated_vector_op_apply_func_0
-{
-  void operator() (int v)
-  {
-    dist[v] = (2147483647) ;
-  };
-};
-struct updateEdge
-{
-  bool operator() (int src, int dst, int weight)
-  {
-    bool output3 = false;
-    int new_dist = (dist[src] + weight);
-    if(dist[dst] > new_dist) {
-      dist[dst] = new_dist;
-      output3 = true;
-    }
-    return output3;
-  };
-};
-struct reset
-{
-  void operator() (int v)
-  {
-    dist[v] = (2147483647) ;
-  };
-};
-
-extern "C" int  __attribute__ ((noinline)) dist_generated_vector_op_apply_func_0_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		dist_generated_vector_op_apply_func_0()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) reset_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		reset()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call(int *in_indices, WNode *in_neighbors, int *frontier, int *modified_vertexsubset1, int V, int E, int block_size_x) {
-	edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(in_indices, in_neighbors, frontier, modified_vertexsubset1, updateEdge(), V, E, block_size_x);
-	return 0;
-}
-
-
diff --git a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
deleted file mode 100644
index ee50a54d6..000000000
--- a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-#ifndef __PR_PULL_BENCHMARK_HPP
-#define __PR_PULL_BENCHMARK_HPP
-
-#include <math.h>
-#include <local_range.h>
-#include <vertex_struct.h>
-#include <atomics.h>
-#endif
diff --git a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
deleted file mode 100644
index 23da05000..000000000
--- a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __SSSP_BENCHMARK_HPP
-#define __SSSP_BENCHMARK_HPP
-
-#include <vertex_struct.h>
-#include <local_range.h>
-#include <atomics.h>
-#include <wnode.h>
-#endif
diff --git a/examples/graphit/test_sssp_delta/main.cpp b/examples/graphit/test_sssp_delta/main.cpp
deleted file mode 100644
index 069ee5426..000000000
--- a/examples/graphit/test_sssp_delta/main.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-#include "sssp.hpp"
-#define X 16 
-#define Y 2
-#define NUM_LOCKS 1024
-#define VERIFY false 
-#define ROOT 6
-#define DELTA 32
-
-WGraphHB edges;
-GlobalScalar<hb_mc_eva_t> dist_dev; 
-//BucketPriorityQueue<int> pq;
-
-bool apply(int s, int d, int w, std::vector<int> &dist) {
-  int new_dist = (dist[s] + w);
-  if(dist[d] > new_dist) {
-    dist[d] = new_dist;
-    return true;
-  }
-  return false;
-}
-
-void sssp_pull_call(std::vector<int> &front, std::vector<int> &next, std::vector<int> &dist) {
-  auto g = edges.getHostGraph();
-  auto * in_neigh = g.in_neighbors_shared_.get();
-  auto ** in_index = g.in_index_shared_.get();
-  for(int d = 0; d < edges.num_nodes(); d++) {
-    int ind = in_index[d] - in_neigh;
-    int degree = g.in_degree(d);
-    auto * neighbors = &in_neigh[ind];
-    for(int s = 0; s < degree; s++){
-      if(front[neighbors[s].v]){
-        if(apply(neighbors[s].v, d, neighbors[s].w, dist)) {
-          next[d] = 1;
-        }
-      } 
-    }
-  }  
-
-}
-
-void host_sssp_pull(BucketPriorityQueue<int>& pq, std::vector<int> &dist, int iter) {
-  dist[ROOT] = 0;
-  Device::Ptr device = Device::GetInstance(); 
-  Vector<int> next_frontier_dev = Vector<int>(edges.num_nodes());
-  std::vector<int> h_next(edges.num_nodes(), 0);
-  std::vector<int> h_front(edges.num_nodes(), 0);  
-
-  for(int i = 0; i < iter; i++) {
-    if(!(pq.finished() == 0)) { std::cout << "no more items on iter: " << i << "\n"; break; }
-    Vector<int32_t> front = pq.popDenseReadyVertexSet(); 
-    front.copyToHost(h_front.data(), edges.num_nodes()); 
-    device->freeze_cores();
-    device->read_dma();
-    device->unfreeze_cores();
-    int num_elems = std::count(h_front.begin(), h_front.end(), 1);
-    std::cout << "num elems in front: " << num_elems << " val of 0: " << h_front[0] << std::endl; 
-    sssp_pull_call(h_front, h_next, dist);
-    num_elems = std::count(h_next.begin(), h_next.end(), 1);
-    std::cout << "num elems in next front: " << num_elems << std::endl; 
-    std::cout << "dist of 1: " << dist[1] << std::endl;
-    next_frontier_dev.copyToDevice(h_next.data(), edges.num_nodes());  
-    hammerblade::write_global_buffer_dma<int>(dist.data(), dist_dev, edges.num_nodes());
-    device->freeze_cores();
-    device->write_dma();
-    device->unfreeze_cores();
-    hammerblade::updateBucketWithGraphItVertexSubset<int>(next_frontier_dev, pq);
-    std::fill(h_next.begin(), h_next.end(), 0);
-  }
-}
-void host_sssp_push(BucketPriorityQueue<int> &pq, std::vector<int> &dist, int iter) {
-  host_sssp_pull(pq, dist, iter);
-}
-
-int launch(int argc, char * argv[]){
-  InputParser input(argc, argv);
-  if(!input.cmdOptionExists("-g")) { 
-  
-    std::cerr << "no input args\n";
-    for(auto i = 0; i < argc; i++) {
-      std::cerr << argv[i] << " ";
-    }
-    std::cerr << std::endl;
-    return 0;
-  }
-  std::string ucode_path = input.getRISCVFile();
-
-  int iter = 0;
-  //std::string iterstrbase = "iteration-";
-  //auto pos = ucode_path.find(iterstrbase);
-  //auto iterstr = ucode_path.substr(pos + iterstrbase.size(), std::string::npos);
-  //std::stringstream ss(iterstr);
-  //ss >> iter;
-  std::cerr << "iteration: " << iter << std::endl;
-
-  int version = 0; //pull-vertex
-  if(ucode_path.find("push-vertex") != std::string::npos) {
-    version = 1;
-  }
-  std::cerr << "load microcode" << std::endl;
-  hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
-  std::cerr << "load graph" << std::endl;
-
-  std::string graph_f = input.getCmdOption("-g");
-  //std::string frontier_f = input.getCmdOption("-f");
-  edges = hammerblade::builtin_loadWeightedEdgesFromFileToHB (graph_f.c_str()); 
-  std::cerr << "out deg of 0: " << edges.out_degree(5) << "num edges: " << edges.num_edges() << std::endl;
-
-
-  Device::Ptr device = Device::GetInstance(); 
-  dist_dev = GlobalScalar<hb_mc_eva_t>("dist");
-  hammerblade::init_global_array<int>(edges.num_nodes(), dist_dev);
-  hammerblade::assign_val_dma<int>(0, edges.num_nodes(), (2147483647), dist_dev);
-  int start_vertex = 0;
-  //hammerblade::insert_val<int>(start_vertex, 0, dist_dev); 
- 
-  std::cerr << "init locks\n";
-  GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
-  hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
-  std::atomic<int> tmp_array[NUM_LOCKS] = {};
-  hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_array, glbl_locks, NUM_LOCKS);
-
-  std::cerr << "doing batch dma write" << std::endl;
-  device->freeze_cores();
-  device->write_dma();
-  device->unfreeze_cores();
-  hammerblade::insert_val<int>(start_vertex, 0, dist_dev); 
-  std::cerr << "init pq" << std::endl;
-  BucketPriorityQueue<int> pq = BucketPriorityQueue<int>(edges.num_nodes(), &dist_dev, (hammerblade::BucketOrder)1, (hammerblade::PriorityOrder)0, (int) 128, (int) 32);
-
-  std::cerr << "host side compute up to current iter: \n";
-  std::vector<int> h_dist(edges.num_nodes(), 2147483647);
-  if(version == 0) {
-    host_sssp_pull(pq, h_dist, iter);
-  } else {
-    host_sssp_push(pq, h_dist, iter);
-  } 
-  hammerblade::write_global_buffer_dma<int>(h_dist.data(), dist_dev, edges.num_nodes());
-  device->freeze_cores();
-  device->write_dma();
-  device->unfreeze_cores();
-
-  std::cerr << "starting while loop" << std::endl;
-  Vector<int32_t> next_frontier_dev;
-  switch(version){
-    case 0: { // do dense pull bfs
-      //device->enqueueJob("init_kernel", hb_mc_dimension(X,Y), {edges.num_nodes()});
-      //device->runJobs();
-      for(int i = 0; i < 1; i++) //just doing one large iteration
-      {
-     
-	std::cerr << "doing SSSP Delta Stepping kernel" << std::endl;
-        //Vector<int32_t> frontier = hammerblade::getBucketWithGraphItVertexSubset<int>(pq);
-        Vector<int32_t> frontier = pq.popDenseReadyVertexSet(); 
-        std::cerr << "got frontier from pq\n";
-        next_frontier_dev = Vector<int32_t>(edges.num_nodes());
-        //next_frontier_dev.assign(0, edges.num_nodes(), 0);
-        //device->freeze_cores();
-        //device->write_dma();
-        //device->unfreeze_cores();
-        printf("0x%08x\n", frontier.getAddr());
-        printf("next: 0x%08x\n", next_frontier_dev.getAddr());
-        std::cerr << "initialized next front\n";
-        device->enqueueJob("edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call",
-                         hb_mc_dimension(X,Y),
-                        {edges.getInIndicesAddr(),
-                         edges.getInNeighborsAddr(),
-                         frontier.getAddr(),
-                         next_frontier_dev.getAddr(),  
-                         edges.num_nodes(),
-                         edges.num_edges(),
-                         edges.num_nodes()});
-        device->runJobs();
-        std::cerr << "updating buckets:\n";
-        hammerblade::updateBucketWithGraphItVertexSubset<int>(next_frontier_dev, pq);
-        hammerblade::deleteObject(frontier);
-    }
-    break;
-    }
-    case 1: { //do sparse push blocked bfs
-    break;
-    } 
-  }
-        
-  std::cerr << "finished while loop" << std::endl;
-
-  if(VERIFY) {
-    int * host_next = new int[edges.num_nodes()];
-    next_frontier_dev.copyToHost(host_next, edges.num_nodes());
-
-    device->freeze_cores();
-    device->read_dma();
-    device->unfreeze_cores();
-
-    ofstream file("./frontier_verify.txt");
-    if(!file.is_open()) std::cerr <<"couldn't open file\n";
-    for(int i = 0; i < edges.num_nodes(); i++) {
-      if(host_next[i] == 1 && i % 50 == 0) std::cerr << i << std::endl;
-      file << host_next[i] << std::endl;
-    }
-    file.close();
-  }
-	device->finish();
-  return 0;
-}
-#ifdef VCS 
-int vcs_main(int argc, char ** argv) {
-    bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n");
-    int rc = launch(argc,argv);
-    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
-    return rc;
-}
-#else
-int main(int argc, char ** argv) {
-    bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n");
-    int rc = launch(argc,argv);
-    bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
-    return rc;
-}
-#endif
diff --git a/examples/graphit/test_sssp_delta/sssp.hpp b/examples/graphit/test_sssp_delta/sssp.hpp
deleted file mode 100644
index 2dfcd3d5c..000000000
--- a/examples/graphit/test_sssp_delta/sssp.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __SSSP_BENCHMARK_HPP
-#define __SSSP_BENCHMARK_HPP
-
-#pragma once
-#include "hb_intrinsics.h"
-#include "infra_hb/host/arg_parser.hpp"
-#include "infra_hb/host/priority_queue.hpp"
-#include <string.h>
-#include <stdio.h>
-#include <fstream> 
-#include <atomic>
-#include <bsg_manycore_tile.h>
-#include <bsg_manycore_errno.h>
-#include <bsg_manycore_tile.h>
-#include <bsg_manycore_loader.h>
-#include <bsg_manycore_cuda.h>
-#include <cl_manycore_regression.h>
-
-using hammerblade::Device;
-using hammerblade::Vector;
-using hammerblade::GraphHB;
-using hammerblade::WGraphHB;
-using hammerblade::GlobalScalar;
-using hammerblade::BucketPriorityQueue;
-using hammerblade::Bucket;
-#endif
diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile
deleted file mode 100644
index 4291f23a1..000000000
--- a/examples/graphit/test_vec_add_parallel/Makefile
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2021, University of Washington All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
-#
-# Redistributions of source code must retain the above copyright notice, this list
-# of conditions and the following disclaimer.
-#
-# Redistributions in binary form must reproduce the above copyright notice, this
-# list of conditions and the following disclaimer in the documentation and/or
-# other materials provided with the distribution.
-#
-# Neither the name of the copyright holder nor the names of its contributors may
-# be used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# This Makefile compiles, links, and executes examples Run `make help`
-# to see the available targets for the selected platform.
-
-################################################################################
-# environment.mk verifies the build environment and sets the following
-# makefile variables:
-#
-# LIBRAIRES_PATH: The path to the libraries directory
-# HARDWARE_PATH: The path to the hardware directory
-# EXAMPLES_PATH: The path to the examples directory
-# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
-# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
-###############################################################################
-
-REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
-
-include $(REPLICANT_PATH)/environment.mk
-SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
-CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
-
-# TEST_NAME is the basename of the executable
-TEST_NAME = main
-# KERNEL_NAME is the name of the CUDA-Lite Kernel
-KERNEL_NAME = vec_add_parallel
-
-###############################################################################
-# Host code compilation flags and flow
-###############################################################################
-
-# TEST_SOURCES is a list of source files that need to be compiled
-TEST_SOURCES = main.c
-
-DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE
-CDEFINES += 
-CXXDEFINES += 
-
-FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
-CFLAGS   += -std=c99 $(FLAGS)
-CXXFLAGS += -std=c++11 $(FLAGS)
-
-# compilation.mk defines rules for compilation of C/C++
-include $(EXAMPLES_PATH)/compilation.mk
-
-# Specify any header file dependencies
-main.o: INCLUDES += -I$(EXAMPLES_PATH)
-main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h
-
-###############################################################################
-# Host code link flags and flow
-###############################################################################
-
-LDFLAGS +=
-
-# link.mk defines rules for linking of the final execution binary.
-include $(EXAMPLES_PATH)/link.mk
-
-###############################################################################
-# Device code compilation flow
-###############################################################################
-
-# BSG_MANYCORE_KERNELS is a list of manycore executables that should
-# be built before executing.
-BSG_MANYCORE_KERNELS = kernel.riscv
-
-kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX)
-kernel.riscv: kernel.rvo
-
-# Tile Group Dimensions
-TILE_GROUP_DIM_X = 2
-TILE_GROUP_DIM_Y = 2
-RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
-RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
-
-include $(EXAMPLES_PATH)/graphit/riscv.mk
-
-###############################################################################
-# Execution flow
-#
-# C_ARGS: Use this to pass arguments that you want to appear in argv
-#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
-#
-# SIM_ARGS: Use this to pass arguments to the simulator
-###############################################################################
-C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)
-
-SIM_ARGS ?=
-
-# Include platform-specific execution rules
-include $(EXAMPLES_PATH)/execution.mk
-
-###############################################################################
-# Regression Flow
-###############################################################################
-
-regression: main.exec.log
-	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
-
-###############################################################################
-# Default rules, help, and clean
-###############################################################################
-.DEFAULT_GOAL := help
-help:
-	@echo "Usage:"
-	@echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}"
-	@echo "      $(TEST_NAME).profile: Build executable with profilers enabled"
-	@echo "      $(TEST_NAME).debug: Build waveform executable (if VCS)"
-	@echo "      $(TEST_NAME).{profile,debug}.log: Run specific executable"
-	@echo "      clean: Remove all subdirectory-specific outputs"
-
-
-.PHONY: clean
-
-clean:
-
-
diff --git a/examples/graphit/test_vec_add_parallel/kernel.cpp b/examples/graphit/test_vec_add_parallel/kernel.cpp
deleted file mode 100644
index b2ea1ae88..000000000
--- a/examples/graphit/test_vec_add_parallel/kernel.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//This kernel adds 2 vectors 
-
-#include <bsg_manycore.h>
-#include <bsg_set_tile_x_y.h>
-#include <bsg_tile_group_barrier.hpp>
-
-bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
-
-extern "C" __attribute__ ((noinline))
-int kernel_vec_add_parallel(int *A, int *B, int *C, int N, int block_size_x) {
-
-	int start_x = block_size_x * (__bsg_tile_group_id_y * __bsg_grid_dim_x + __bsg_tile_group_id_x); 
-	for (int iter_x = __bsg_id; iter_x < block_size_x; iter_x += bsg_tiles_X * bsg_tiles_Y) { 
-		C[start_x + iter_x] = A[start_x + iter_x] + B[start_x + iter_x];
-	}
-
-	barrier.sync();
-
-	return 0;
-}
diff --git a/examples/graphit/test_vec_add_parallel/main.c b/examples/graphit/test_vec_add_parallel/main.c
deleted file mode 100644
index 07c9bd209..000000000
--- a/examples/graphit/test_vec_add_parallel/main.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2019, University of Washington All rights reserved.
-// 
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-// 
-// Redistributions of source code must retain the above copyright notice, this list
-// of conditions and the following disclaimer.
-// 
-// Redistributions in binary form must reproduce the above copyright notice, this
-// list of conditions and the following disclaimer in the documentation and/or
-// other materials provided with the distribution.
-// 
-// Neither the name of the copyright holder nor the names of its contributors may
-// be used to endorse or promote products derived from this software without
-// specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <bsg_manycore_tile.h>
-#include <bsg_manycore_errno.h>
-#include <bsg_manycore_tile.h>
-#include <bsg_manycore_loader.h>
-#include <bsg_manycore_cuda.h>
-#include <stdlib.h>
-#include <time.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/ioctl.h>
-#include <stdio.h>
-#include <cl_manycore_regression.h>
-
-#define ALLOC_NAME "default_allocator"
-
-/*!
- * Runs the vector addition a grid of 2x2 tile groups. A[N] + B[N] --> C[N]
- * Grid dimensions are determines by how much of a load we want for each tile group (block_size_x)
- * This tests uses the software/spmd/bsg_cuda_lite_runtime/vec_add_parallel/ Manycore binary in the BSG Manycore bitbucket repository.  
-*/
-
-
-void host_vec_add (int *A, int *B, int *C, int N) { 
-        for (int i = 0; i < N; i ++) { 
-                C[i] = A[i] + B[i];
-        }
-        return;
-}
-
-
-int kernel_vec_add_parallel (int argc, char **argv) {
-        int rc;
-        char *bin_path, *test_name;
-        struct arguments_path args = {NULL, NULL};
-
-        argp_parse (&argp_path, argc, argv, 0, 0, &args);
-        bin_path = args.path;
-        test_name = args.name;
-
-        bsg_pr_test_info("Running the CUDA Vector Addition Kernel on a grid of 2x2 tile groups.\n\n");
-
-        srand(time); 
-
-        /*********************/
-        /* Initialize device */
-        /*********************/
-        hb_mc_device_t device;
-        BSG_CUDA_CALL(hb_mc_device_init(&device, test_name, 0));
-
-        hb_mc_pod_id_t pod;
-        hb_mc_device_foreach_pod_id(&device, pod)
-        {
-                /**********************************************************************/
-                /* Define path to binary.                                             */
-                /* Initialize device, load binary and unfreeze tiles.                 */
-                /**********************************************************************/
-                bsg_pr_test_info("Loading program for %s onto pod %d\n",
-                                 test_name, pod);
-
-                BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod));
-                BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0));
-
-                /*****************************************************************************************************************
-                 * Allocate memory on the device for A, B and C.
-                 ******************************************************************************************************************/
-                uint32_t N = 1024;
-
-                eva_t A_device, B_device, C_device;
-                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &A_device)); /* allocate A[N] on the device */
-                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &B_device)); /* allocate B[N] on the device */
-                BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &C_device)); /* allocate C[N] on the device */
-
-                /*****************************************************************************************************************
-                 * Allocate memory on the host for A & B and initialize with random values.
-                 ******************************************************************************************************************/
-                uint32_t A_host[N]; /* allocate A[N] on the host */
-                uint32_t B_host[N]; /* allocate B[N] on the host */
-                for (int i = 0; i < N; i++) { /* fill A with arbitrary data */
-                        A_host[i] = rand() & 0xFFFF;
-                        B_host[i] = rand() & 0xFFFF;
-                }
-
-                /*****************************************************************************************************************
-                 * Copy A & B from host onto device DRAM.
-                 ******************************************************************************************************************/
-                void *dst = (void *) ((intptr_t) A_device);
-                void *src = (void *) &A_host[0];
-                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy A to the device  */
-
-                dst = (void *) ((intptr_t) B_device);
-                src = (void *) &B_host[0];
-                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy B to the device */
-
-                /*****************************************************************************************************************
-                 * Define block_size_x/y: amount of work for each tile group
-                 * Define tg_dim_x/y: number of tiles in each tile group
-                 * Calculate grid_dim_x/y: number of tile groups needed based on block_size_x/y
-                 ******************************************************************************************************************/
-                uint32_t block_size_x = 64;
-                hb_mc_dimension_t tg_dim = { .x = 2, .y = 2 };
-                hb_mc_dimension_t grid_dim = { .x = N / block_size_x, .y = 1 };
-
-                /*****************************************************************************************************************
-                 * Prepare list of input arguments for kernel.
-                 ******************************************************************************************************************/
-                int cuda_argv[5] = {A_device, B_device, C_device, N, block_size_x};
-
-                /*****************************************************************************************************************
-                 * Enquque grid of tile groups, pass in grid and tile group dimensions, kernel name, number and list of input arguments
-                 ******************************************************************************************************************/
-                BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_vec_add_parallel", 5, cuda_argv));
-
-                /*****************************************************************************************************************
-                 * Launch and execute all tile groups on device and wait for all to finish.
-                 ******************************************************************************************************************/
-                BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device));
-
-                /*****************************************************************************************************************
-                 * Copy result matrix back from device DRAM into host memory.
-                 ******************************************************************************************************************/
-                uint32_t C_host[N];
-                src = (void *) ((intptr_t) C_device);
-                dst = (void *) &C_host[0];
-                BSG_CUDA_CALL(hb_mc_device_memcpy (&device, (void *) dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_HOST)); /* copy C to the host */
-
-                /*****************************************************************************************************************
-                 * Freeze the tiles and memory manager cleanup.
-                 ******************************************************************************************************************/
-                BSG_CUDA_CALL(hb_mc_device_program_finish(&device));
-
-                /*****************************************************************************************************************
-                 * Calculate the expected result using host code and compare the results.
-                 ******************************************************************************************************************/
-                uint32_t C_expected[N];
-                host_vec_add (A_host, B_host, C_expected, N);
-
-
-                int mismatch = 0;
-                for (int i = 0; i < N; i++) {
-                        if (A_host[i] + B_host[i] != C_host[i]) {
-                                bsg_pr_err(BSG_RED("Mismatch: ") "C[%d]:  0x%08" PRIx32 " + 0x%08" PRIx32 " = 0x%08" PRIx32 "\t Expected: 0x%08" PRIx32 "\n",
-                                           i , A_host[i], B_host[i], C_host[i], C_expected[i]);
-                                mismatch = 1;
-                        }
-                }
-
-                if (mismatch) {
-                        return HB_MC_FAIL;
-                }
-        }
-
-        BSG_CUDA_CALL(hb_mc_device_finish(&device));
-
-        return HB_MC_SUCCESS;
-}
-
-#ifdef VCS
-int vcs_main(int argc, char ** argv)
-#else
-int main(int argc, char ** argv)
-#endif
-{
-        bsg_pr_test_info("test_vec_add_parallel Regression Test \n");
-        int rc = kernel_vec_add_parallel(argc, argv);
-        bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS);
-        return rc;
-}
-
-

From b77d6353a39c4b69630bb59aab19aaf025f90c81 Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Wed, 5 May 2021 17:12:49 -0700
Subject: [PATCH 19/22] [pr-nibble] change regression/clean rules, update
 kernel and host code with prefetching kernel

---
 examples/graphit/test_pr_nibble/Makefile      |   6 +-
 examples/graphit/test_pr_nibble/kernel.cpp    | 230 ------------------
 .../test_pr_nibble/kernel/hybrid/kernel.cpp   | 101 +++-----
 examples/graphit/test_pr_nibble/main.cpp      |  28 +--
 4 files changed, 44 insertions(+), 321 deletions(-)
 delete mode 100644 examples/graphit/test_pr_nibble/kernel.cpp

diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile
index bdd77f513..720bbdc24 100644
--- a/examples/graphit/test_pr_nibble/Makefile
+++ b/examples/graphit/test_pr_nibble/Makefile
@@ -52,7 +52,7 @@ GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx
 TEST_NAME = main
 # KERNEL_NAME is the name of the CUDA-Lite Kernel
 KERNEL_NAME = pr_nibble
-HOST_TARGET := $(TEST_NAME).profile
+HOST_TARGET := $(TEST_NAME).exec
 
 BASE_VERSIONS += hybrid
 
@@ -179,7 +179,7 @@ all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log)
 # Regression Flow
 ###############################################################################
 
-regression: main.exec.log
+regression: versions all-versions 
 	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
 
 ###############################################################################
@@ -201,6 +201,6 @@ version.clean:
 
 .PHONY: clean
 
-clean: version.clean
+clean: bleach-versions version.clean
 
 
diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp
deleted file mode 100644
index 79186ab8a..000000000
--- a/examples/graphit/test_pr_nibble/kernel.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-//#define DEBUG
-#include <bsg_manycore.h>
-
-#ifdef DEBUG
-#define BSG_TILE_GROUP_X_DIM 1 
-#define BSG_TILE_GROUP_Y_DIM 1
-#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
-#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
-#else
-#include <bsg_set_tile_x_y.h>
-// #define BSG_TILE_GROUP_X_DIM 16 
-// #define BSG_TILE_GROUP_Y_DIM 8
-#endif
-
-#include <bsg_tile_group_barrier.hpp>
-bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
-
-#include <pr_nibble.hpp>
-#include <cstring>
-
-#ifdef DEBUG
-#define pr_dbg(fmt, ...)			\
-		bsg_printf(fmt, ##__VA_ARGS__)
-#else
-#define pr_dbg(fmt, ...)
-#endif
-
-__attribute__((section(".dram"))) float  * __restrict p;
-__attribute__((section(".dram"))) float  * __restrict old_rank;
-__attribute__((section(".dram"))) float  * __restrict new_rank;
-__attribute__((section(".dram"))) int  * __restrict out_degree;
-__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
-//__attribute__((section(".dram"))) double alpha = 0.15; 
-//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; 
-
-template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
-{
-  //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); 
-  //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V);
-  int start, end;
-  local_range(V, &start, &end);
-  for ( int d = start; d < end; d++) {
-    int degree = in_indices[d + 1] - in_indices[d];
-    int * neighbors = &in_neighbors[in_indices[d]];
-    for(int s = 0; s < degree; s++) { 
-      if(from_vertexset[neighbors[s]]) {
-        //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); 
-        apply_func (neighbors[s] , d);
-      }
-    } //end of loop on in neighbors
-  } //end of outer for loop
-  return 0;
-} //end of edgeset apply function 
-
-template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
-{
-  //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); 
-  //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V);
-  int start, end;
-  local_range(V, &start, &end);
-  for ( int s = start; s < end; s++) {
-    if(from_vertexset[s]) {
-      int degree = out_indices[s + 1] - out_indices[s];
-      int * neighbors = &out_neighbors[out_indices[s]];
-      for(int d = 0; d < degree; d++) { 
-        apply_func (s, neighbors[d]);
-	//if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); }
-	
-      }
-    } //end of loop on in neighbors
-  } //end of outer for loop
-  //barrier.sync();
-  return 0;
-} //end of edgeset apply function 
-
-
-struct generated_vector_op_apply_func_4
-{
-  void operator() (int v)
-  {
-    out_degree[v] = generated_tmp_vector_3[v];
-  };
-};
-struct new_rank_generated_vector_op_apply_func_2
-{
-  void operator() (int v)
-  {
-    new_rank[v] = ((float) 0) ;
-  };
-};
-struct old_rank_generated_vector_op_apply_func_1
-{
-  void operator() (int v)
-  {
-    old_rank[v] = ((float) 0) ;
-  };
-};
-struct p_generated_vector_op_apply_func_0
-{
-  void operator() (int v)
-  {
-    p[v] = ((float) 0) ;
-  };
-};
-struct updateEdge
-{
-  void operator() (int src, int dst)
-  {
-    float alpha = 0.15; 
-    new_rank[dst] = (new_rank[dst] + (((((1)  - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src]));
-  };
-};
-struct updateSelf
-{
-  void operator() (int v)
-  {
-    float alpha = 0.15; 
-    p[v] = (p[v] + ((((2)  * alpha) / ((1)  + alpha)) * old_rank[v]));
-    new_rank[v] = (0) ;
-  };
-};
-struct filter_frontier
-{
-  bool operator() (int v)
-  {
-    float epsilon = (float) 1e-6; 
-    bool output ;
-    //if(old_rank[v] == 0) return 0;
-    if(new_rank[v] == 0) return 0;
-    //output = (old_rank[v]) > ((out_degree[v] * epsilon));
-    output = (new_rank[v]) > ((out_degree[v] * epsilon));
-    return output;
-  };
-};
-
-extern "C" int  __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		p_generated_vector_op_apply_func_0()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		old_rank_generated_vector_op_apply_func_1()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		new_rank_generated_vector_op_apply_func_2()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		generated_vector_op_apply_func_4()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int V, int tag_c) {
-        //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-	barrier.sync();
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		updateSelf()(iter_x);
-	}
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
-	barrier.sync();
-        //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-  bsg_saif_start();
-	edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
-  bsg_saif_end();
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
- extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
-	barrier.sync(); 
-  bsg_cuda_print_stat_start(tag_c);
-  bsg_saif_start();
-	edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
-  bsg_saif_end();
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
-extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { 
-        //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c);
-        //pr_dbg("%i: on frontier filter %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-	barrier.sync();
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		if (iter_x < V) {
-			next5[iter_x] = 0;
-			if ( filter_frontier()( iter_x ) ) {
-				next5[iter_x] = 1;
-				//pr_dbg("added vertex %i to frontier\n", iter_x);
-			}
-     		}
-		else { break; }
-	} //end of loop
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
-
diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
index 16e66425c..14449a85d 100644
--- a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
+++ b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
@@ -29,10 +29,11 @@ __attribute__((section(".dram"))) float  * __restrict p;
 __attribute__((section(".dram"))) float  * __restrict old_rank;
 __attribute__((section(".dram"))) float  * __restrict new_rank;
 __attribute__((section(".dram"))) int  * __restrict out_degree;
-__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
+//__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
 //__attribute__((section(".dram"))) double alpha = 0.15; 
 //__attribute__((section(".dram"))) double epsilon = (double) 1e-6; 
 
+
 template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
 {
   //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); 
@@ -73,35 +74,6 @@ template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(i
   return 0;
 } //end of edgeset apply function 
 
-
-struct generated_vector_op_apply_func_4
-{
-  void operator() (int v)
-  {
-    out_degree[v] = generated_tmp_vector_3[v];
-  };
-};
-struct new_rank_generated_vector_op_apply_func_2
-{
-  void operator() (int v)
-  {
-    new_rank[v] = ((float) 0) ;
-  };
-};
-struct old_rank_generated_vector_op_apply_func_1
-{
-  void operator() (int v)
-  {
-    old_rank[v] = ((float) 0) ;
-  };
-};
-struct p_generated_vector_op_apply_func_0
-{
-  void operator() (int v)
-  {
-    p[v] = ((float) 0) ;
-  };
-};
 struct updateEdge
 {
   void operator() (int src, int dst)
@@ -133,49 +105,14 @@ struct filter_frontier
   };
 };
 
-extern "C" int  __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		p_generated_vector_op_apply_func_0()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		old_rank_generated_vector_op_apply_func_1()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		new_rank_generated_vector_op_apply_func_2()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
-extern "C" int  __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) {
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		generated_vector_op_apply_func_4()(iter_x);
-	}
-	barrier.sync();
-	return 0;
-}
 extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) {
+        //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c);
   bsg_cuda_print_stat_start(tag_c);
 	barrier.sync();
 	int start, end;
 	local_range(V, &start, &end);
 	for (int iter_x = start; iter_x < end; iter_x++) {
-		if(frontier[iter_x]) { updateSelf()(iter_x); }
+		if(frontier[iter_x]) {updateSelf()(iter_x);}
 	}
   bsg_cuda_print_stat_end(tag_c);
 	barrier.sync();
@@ -226,4 +163,34 @@ extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5
 	return 0;
 }
 
+extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) {
+  	int id = __bsg_id;
+  	int threads = bsg_tiles_X * bsg_tiles_Y;
+    // prefetch all data;
+    for (int i = 32 * id; i < E; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (p[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i]));
+    }
+    for (int i = 32 * id; i < V; i += 32 * threads) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i]));
+    }
+		barrier.sync();
+    return ;
+
+}
+
 
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
index f0bee8b64..56dd9aaf6 100644
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ b/examples/graphit/test_pr_nibble/main.cpp
@@ -45,14 +45,12 @@ int test_pr_nibble(int argc, char ** argv){
   if(ucode_path.find("push") != std::string::npos) {
     version = 1;
   }
-  else if(ucode_path.find("block") != std::string::npos) {
-    version = 2;
-  }
   int hybrid = 0; //default to vertex pull
   if(ucode_path.find("hybrid") != std::string::npos) {
     hybrid = 1;
   }
   std::cerr << "version: " << version << std::endl;
+  std::cerr << "hybrid: " << hybrid << std::endl;
   std::cerr << "load microcode" << std::endl;
   hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
 
@@ -129,6 +127,9 @@ int test_pr_nibble(int argc, char ** argv){
     switch(version) {
       case 0: //vertex pull
 	    std::cerr << "pull kernel\n";
+      std::cerr << "preloading the cache\n";
+      device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+      device->runJobs();
     	std::cerr << "run update self vertex kernel\n";
     	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
     	device->runJobs();
@@ -147,6 +148,9 @@ int test_pr_nibble(int argc, char ** argv){
       break;
       case 1: //vertex push
 	    std::cerr << "push kernel\n";
+      std::cerr << "preloading the cache\n";
+      device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+      device->runJobs();
     	std::cerr << "run update self vertex kernel\n";
     	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
     	device->runJobs();
@@ -163,24 +167,6 @@ int test_pr_nibble(int argc, char ** argv){
       f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
       std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
       break;
-      case 2: //blocked pull
-	    std::cerr << "blocked pull kernel\n";
-    	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
-    	device->runJobs();
-    	tag_c++;
-    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
-      device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInVertexlistAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
-    	device->runJobs();
-      tag_c++;
-    	std::cerr << "create next frontier\n";
-    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
-    	device->runJobs();
-    	std::cerr << "swap arrays\n";
-    	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
-      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
-      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
-      break;
     }
     tag_c++;
 

From 7d47bbf7cf9ad696ec5c0aa4f8d64671de7cf8fb Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Wed, 5 May 2021 17:16:08 -0700
Subject: [PATCH 20/22] clone graphit submodule with ssh not https

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 18ee1bd0f..f352d419a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -9,4 +9,4 @@
 	url = git@github.com:bespoke-silicon-group/hammerblade-helpers
 [submodule "examples/graphit/graphit-src"]
 	path = examples/graphit/graphit-src
-	url = https://github.com/bespoke-silicon-group/graphit.git
+	url = git@github.com:bespoke-silicon-group/graphit.git 

From 741062c4b4ce71680b864edbaa4d9b0b00b77a2b Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 10 May 2021 14:02:33 -0700
Subject: [PATCH 21/22] rename test, remove tabs, switch to 4 spaces for tabs,
 some cleaning up of code

---
 .../{test_pr_nibble => pr_nibble}/Makefile    |   0
 .../pr_nibble/kernel/hybrid/kernel.cpp        | 175 ++++++++++++++++
 .../kernel/include/pr_nibble.hpp              |   0
 examples/graphit/pr_nibble/main.cpp           | 177 ++++++++++++++++
 .../{test_pr_nibble => pr_nibble}/pr.hpp      |   2 +-
 examples/graphit/pr_nibble/pr_host.hpp        |  45 ++++
 .../test_pr_nibble/kernel/hybrid/kernel.cpp   | 196 ------------------
 examples/graphit/test_pr_nibble/main.cpp      | 191 -----------------
 examples/graphit/test_pr_nibble/pr_host.hpp   |  50 -----
 9 files changed, 398 insertions(+), 438 deletions(-)
 rename examples/graphit/{test_pr_nibble => pr_nibble}/Makefile (100%)
 create mode 100644 examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp
 rename examples/graphit/{test_pr_nibble => pr_nibble}/kernel/include/pr_nibble.hpp (100%)
 create mode 100644 examples/graphit/pr_nibble/main.cpp
 rename examples/graphit/{test_pr_nibble => pr_nibble}/pr.hpp (90%)
 create mode 100644 examples/graphit/pr_nibble/pr_host.hpp
 delete mode 100644 examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
 delete mode 100644 examples/graphit/test_pr_nibble/main.cpp
 delete mode 100644 examples/graphit/test_pr_nibble/pr_host.hpp

diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile
similarity index 100%
rename from examples/graphit/test_pr_nibble/Makefile
rename to examples/graphit/pr_nibble/Makefile
diff --git a/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp
new file mode 100644
index 000000000..294d564a6
--- /dev/null
+++ b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp
@@ -0,0 +1,175 @@
+//#define DEBUG
+#include <bsg_manycore.h>
+
+#ifdef DEBUG
+#define BSG_TILE_GROUP_X_DIM 1 
+#define BSG_TILE_GROUP_Y_DIM 1
+#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
+#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
+#else
+#include <bsg_set_tile_x_y.h>
+#endif
+
+#include <bsg_tile_group_barrier.hpp>
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+#include <pr_nibble.hpp>
+#include <cstring>
+
+#ifdef DEBUG
+#define pr_dbg(fmt, ...)            \
+        bsg_printf(fmt, ##__VA_ARGS__)
+#else
+#define pr_dbg(fmt, ...)
+#endif
+
+__attribute__((section(".dram"))) float  * __restrict p;
+__attribute__((section(".dram"))) float  * __restrict old_rank;
+__attribute__((section(".dram"))) float  * __restrict new_rank;
+__attribute__((section(".dram"))) int  * __restrict out_degree;
+
+
+// Pull-direction edge traversal. For each destination d in the [start, end) range that
+// local_range() reports for V, visits in_neighbors[in_indices[d] .. in_indices[d + 1]) and
+// calls apply_func(s, d) for every listed source s whose from_vertexset entry is non-zero.
+template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x)
+{
+    int first, last; // E and block_size_x are accepted but never read
+    local_range(V, &first, &last);
+    for (int dst = first; dst < last; ++dst) {
+        for (int e = in_indices[dst], row_end = in_indices[dst + 1]; e < row_end; ++e) {
+            const int src = in_neighbors[e];
+            if (from_vertexset[src]) { apply_func(src, dst); }
+        }
+    }
+    return 0;
+}
+
+// Push-direction edge traversal. For each source s in the [start, end) range that
+// local_range() reports for V whose from_vertexset entry is non-zero, calls
+// apply_func(s, d) for every d in out_neighbors[out_indices[s] .. out_indices[s + 1]).
+template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x)
+{
+    int first, last; // E and block_size_x are accepted but never read
+    local_range(V, &first, &last);
+    for (int src = first; src < last; ++src) {
+        if (!from_vertexset[src]) { continue; } // source is not flagged in from_vertexset
+        const int row_end = out_indices[src + 1];
+        for (int e = out_indices[src]; e < row_end; ++e) {
+            apply_func(src, out_neighbors[e]);
+        }
+    }
+    return 0;
+}
+
+// Edge functor: new_rank[dst] += ((1 - alpha) / (1 + alpha)) * old_rank[src] / out_degree[src], alpha = 0.15.
+// NOTE(review): plain read-modify-write on new_rank[dst]; confirm callers never hit the same dst concurrently.
+struct updateEdge {
+    void operator() (int src, int dst) {
+        const float alpha = 0.15;
+        new_rank[dst] += (((1 - alpha) / (1 + alpha)) * old_rank[src]) / out_degree[src];
+    }
+};
+// Vertex functor: adds ((2 * alpha) / (1 + alpha)) * old_rank[v] to p[v], then
+// resets new_rank[v] to zero. alpha is fixed at 0.15.
+struct updateSelf {
+    void operator() (int v) {
+        const float alpha = 0.15;
+        p[v] += ((2 * alpha) / (1 + alpha)) * old_rank[v];
+        new_rank[v] = 0;
+    }
+};
+// Frontier predicate: true only when new_rank[v] is non-zero and strictly
+// greater than out_degree[v] * epsilon, with epsilon = 1e-6.
+struct filter_frontier {
+    bool operator() (int v) {
+        const float epsilon = (float) 1e-6;
+        const float rank = new_rank[v];
+        // an exactly-zero rank is rejected before the threshold comparison
+        if (rank == 0) { return false; }
+        return rank > (out_degree[v] * epsilon);
+    }
+};
+
+// Kernel entry: runs updateSelf on each index in the range local_range() reports for V whose frontier entry is non-zero.
+extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) {
+    bsg_cuda_print_stat_start(tag_c);
+    barrier.sync();
+    int lo, hi;
+    local_range(V, &lo, &hi);
+    for (int v = lo; v < hi; ++v)
+        if (frontier[v]) { updateSelf()(v); }
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0;
+}
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { // kernel entry: pull traversal applying updateEdge; always returns 0
+    barrier.sync(); // sync on the bsg_barrier declared near the top of this file before the tagged region opens
+    bsg_cuda_print_stat_start(tag_c); // paired with bsg_cuda_print_stat_end(tag_c) below
+    bsg_saif_start(); // paired with bsg_saif_end() below
+    edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); // frontier is passed as from_vertexset
+    bsg_saif_end();
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync(); // second sync after the tagged region closes
+    return 0;
+}
+
+ extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { // kernel entry: push traversal applying updateEdge; always returns 0
+    barrier.sync(); // sync on the bsg_barrier declared near the top of this file before the tagged region opens
+    bsg_cuda_print_stat_start(tag_c); // paired with bsg_cuda_print_stat_end(tag_c) below
+    bsg_saif_start(); // paired with bsg_saif_end() below
+    edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); // frontier is passed as from_vertexset
+    bsg_saif_end();
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync(); // second sync after the tagged region closes
+    return 0;
+}
+
+// Kernel entry that rewrites next5 over the [start, end) range local_range()
+// reports for V: next5[v] is set to 1 when filter_frontier accepts v and to 0
+// otherwise. block_size_x is accepted but never read. Always returns 0.
+extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) {
+    bsg_cuda_print_stat_start(tag_c);
+    barrier.sync();
+    int lo, hi;
+    local_range(V, &lo, &hi);
+    for (int v = lo; v < hi; ++v) {
+        // leave the loop as soon as the index reaches V
+        if (v >= V) { break; }
+        next5[v] = 0;
+        if (filter_frontier()(v)) { next5[v] = 1; }
+    }
+    bsg_cuda_print_stat_end(tag_c);
+    barrier.sync();
+    return 0;
+}
+
+// Issues one discarded load ("lw x0") at indices 32 * __bsg_id + k * 32 * (bsg_tiles_X * bsg_tiles_Y)
+// of each array below, then syncs on the barrier. Presumably this warms the cache:
+// the host prints "preloading the cache" before launching it -- confirm intent.
+extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) {
+    const int first = 32 * __bsg_id;
+    const int step = 32 * (bsg_tiles_X * bsg_tiles_Y);
+    for (int i = first; i < E; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (p[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i]));
+    }
+    for (int i = first; i < V; i += step) {
+        asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i]));
+    }
+    barrier.sync();
+}
diff --git a/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp
similarity index 100%
rename from examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp
rename to examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp
diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp
new file mode 100644
index 000000000..55a05fd8e
--- /dev/null
+++ b/examples/graphit/pr_nibble/main.cpp
@@ -0,0 +1,177 @@
+#include "pr.hpp"
+
+//#define DEBUG
+
+#define VERIFY 0
+
+#ifdef DEBUG
+#define X 1 
+#define Y 1
+#else
+#define X 16 //tile group dim X 
+#define Y 8 // tile group dim Y
+#endif
+
+#define ROOT 6 
+#define NUM_LOCKS 1024 //width of manycore * 64
+
+GraphHB edges; 
+GlobalScalar<hb_mc_eva_t> p_dev;
+GlobalScalar<hb_mc_eva_t> old_rank_dev;
+GlobalScalar<hb_mc_eva_t> new_rank_dev;
+GlobalScalar<hb_mc_eva_t> out_degree_dev;
+
+#include "pr_host.hpp"
+
+// Host driver for one PR-Nibble iteration: replays iterations [0, iter) on the
+// host, copies that state to the device, then runs one pull or push iteration
+// there. iter is parsed from the "iteration-<n>" part of the path returned by
+// input.getRISCVFile(). Returns 0 on every path, including a missing "-g".
+int test_pr_nibble(int argc, char ** argv){
+    InputParser input(argc, argv);
+    if(!input.cmdOptionExists("-g")){
+        std::cerr << "no input args\n";
+        return 0;
+    }
+    std::string ucode_path = input.getRISCVFile();
+    // If the path has no "iteration-" marker, parse an empty string so iter stays 0.
+    int iter = 0;
+    std::string iterstrbase = "iteration-";
+    auto pos = ucode_path.find(iterstrbase);
+    auto iterstr = (pos == std::string::npos) ? std::string() : ucode_path.substr(pos + iterstrbase.size());
+    std::stringstream ss(iterstr);
+    ss >> iter;
+    std::cerr << "iteration: " << iter << std::endl;
+
+    int version = 0; //default to vertex pull
+    if(ucode_path.find("push") != std::string::npos) {
+        version = 1;
+    }
+    int hybrid = 0; //default: keep the direction chosen from the path
+    if(ucode_path.find("hybrid") != std::string::npos) {
+        hybrid = 1;
+    }
+    std::cerr << "version: " << version << std::endl;
+    std::cerr << "hybrid: " << hybrid << std::endl;
+    std::cerr << "load microcode" << std::endl;
+    hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
+
+    std::cerr << "load graph" << std::endl;
+    std::string graph_f = input.getCmdOption("-g");
+    edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str());
+
+    std::cerr << "size of graph: " << std::endl;
+    std::cerr << edges.num_nodes() << std::endl;
+    std::cerr << edges.num_edges() << std::endl;
+
+    std::cerr << "init global scalars" << std::endl;
+    p_dev = GlobalScalar<hb_mc_eva_t>("p");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), p_dev);
+    old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
+    new_rank_dev = GlobalScalar<hb_mc_eva_t>("new_rank");
+    hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
+    out_degree_dev = GlobalScalar<hb_mc_eva_t>("out_degree");
+    hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
+
+    std::cerr << "init locks" << std::endl;
+    GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
+    hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
+    std::atomic<int> tmp_a[NUM_LOCKS] = {};
+    Device::Ptr device = Device::GetInstance();
+    int start_vertex = ROOT;
+    Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
+
+    std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
+    std::vector<float> p(edges.num_nodes(), (float) 0.0);
+    std::vector<float> new_rank(edges.num_nodes(), (float) 0.0);
+    std::vector<float> old_rank(edges.num_nodes(), (float) 0.0);
+    std::vector<int32_t> out_degs = edges.get_out_degrees();
+
+    //compute up to current iter on host
+    hfrontier[start_vertex] = 1;
+    new_rank[start_vertex] = (float) 1.0;
+    old_rank[start_vertex] = (float) 1.0;
+    host_pr_calc(p, old_rank, new_rank, hfrontier, iter);
+
+    //copy all variables at their current state to device
+    frontier.copyToDevice(hfrontier.data(), hfrontier.size());
+    hammerblade::write_global_buffer_dma<float>(p.data(), p_dev, p.size());
+    hammerblade::write_global_buffer_dma<float>(old_rank.data(), old_rank_dev, old_rank.size());
+    hammerblade::write_global_buffer_dma<float>(new_rank.data(), new_rank_dev, new_rank.size());
+    hammerblade::write_global_buffer_dma<int32_t>(out_degs.data(), out_degree_dev, out_degs.size());
+    //initialize locks for atomics on device
+    hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_a, glbl_locks, NUM_LOCKS);
+
+    device->freeze_cores();
+    device->write_dma();
+    device->unfreeze_cores();
+    //determine push or pull traversal for this iteration
+    if(hybrid) {
+        int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1);
+        int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges());
+        version = dir ? 0 : 1; //non-zero dir selects pull (0), otherwise push (1)
+    }
+
+    std::cerr << "start of while loop\n";
+    int tag_c = 0;
+    int f_sz = 0;
+    switch(version) { //kernel names below must match the extern "C" symbols exactly (case-sensitive)
+        case 0: //vertex pull
+            std::cerr << "pull kernel\n";
+            std::cerr << "preloading the cache\n";
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
+            std::cerr << "run update self vertex kernel\n";
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "run update edges kernel on iter : " << iter << "\n";
+            device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "create next frontier\n";
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
+            std::cerr << "swap arrays\n";
+            hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+            std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+            break;
+        case 1: //vertex push
+            std::cerr << "push kernel\n";
+            std::cerr << "preloading the cache\n";
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
+            std::cerr << "run update self vertex kernel\n";
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "run update edges kernel on iter : " << iter << "\n";
+            device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+            device->runJobs();
+            tag_c++;
+            std::cerr << "create next frontier\n";
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
+            std::cerr << "swap arrays\n";
+            hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
+            std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
+            break;
+    }
+    if(VERIFY) {
+        ofstream ver_file;
+        ver_file.open("./rank.txt");
+        std::vector<float> host_rank(edges.num_nodes()); //heap buffer rather than a variable-length stack array
+        hammerblade::read_global_buffer_dma<float>(host_rank.data(), old_rank_dev, edges.num_nodes());
+        for(int i = 0; i < edges.num_nodes(); i++) {
+            ver_file << host_rank[i] << std::endl;
+        }
+        ver_file.close();
+    }
+    device->finish();
+    return 0;
+}
+
+declare_program_main("test_pr_nibble", test_pr_nibble); 
diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/pr_nibble/pr.hpp
similarity index 90%
rename from examples/graphit/test_pr_nibble/pr.hpp
rename to examples/graphit/pr_nibble/pr.hpp
index 5cce0e30a..ae01c8cc2 100644
--- a/examples/graphit/test_pr_nibble/pr.hpp
+++ b/examples/graphit/pr_nibble/pr.hpp
@@ -2,7 +2,7 @@
 #ifndef __PR_PULL_BENCHMARK_HPP
 #define __PR_PULL_BENCHMARK_HPP
 
-#include "hb_intrinsics.h"
+#include "hb_intrinsics.h" //graphit host runtime libs
 #include "infra_hb/host/arg_parser.hpp"
 #include <bsg_manycore_regression.h>
 #include <string.h>
diff --git a/examples/graphit/pr_nibble/pr_host.hpp b/examples/graphit/pr_nibble/pr_host.hpp
new file mode 100644
index 000000000..fcbb811e0
--- /dev/null
+++ b/examples/graphit/pr_nibble/pr_host.hpp
@@ -0,0 +1,45 @@
+//function to compute pr-nibble on host up to current iter
+#pragma once
+#include <iostream>
+#include <fstream>
+
+// Runs `iter` PR-Nibble iterations on the host graph returned by edges.getHostGraph().
+//   p, old_rank, new_rank : per-vertex float arrays indexed by vertex id; updated in place.
+//   frontier              : per-vertex 0/1 flags; rewritten at the end of every iteration.
+//   iter                  : number of iterations to run; the loop body is skipped when iter <= 0.
+// `edges` is not a parameter: it must already be declared where this header is included.
+inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank, std::vector<float> & new_rank, std::vector<int> & frontier, int iter) {
+        float alpha = (float) 0.15;
+        float epsilon = (float) 1e-06;
+        auto g = edges.getHostGraph();
+        for(int i = 0; i < iter; i++) {
+                new_rank.assign(old_rank.begin(), old_rank.end());
+                //report the iteration number and how many vertices are flagged in the frontier
+                int num_items = std::count(frontier.begin(), frontier.end(), 1);
+                std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl;
+                //update_self: frontier vertices add a share of their old rank to p and restart new_rank at 0
+                for(int v = 0; v < g.num_nodes(); v++) {
+                        if(frontier[v]) {
+                            p[v] += (2.0 * alpha) / (1.0    + alpha) * old_rank[v];
+                            new_rank[v] = (float) 0.0 ;
+                        }
+                }
+                //update edges: each vertex pulls from its in-neighbors that are in the frontier
+                for(int d = 0; d < g.num_nodes(); d++) {
+                        for(int s : g.in_neigh(d)) {
+                                if(frontier[s]){
+                                        float update = ((1.0 - alpha) / (1.0    + alpha)) * old_rank[s];
+                                        update = update / ((float) g.out_degree(s));
+                                        new_rank[d] += update;
+                                }
+                        }
+                }
+                old_rank.assign(new_rank.begin(), new_rank.end());
+                //update frontier: keep vertices with out-edges whose rank is at least out_degree * epsilon
+                for(int v = 0; v < g.num_nodes(); v++) {
+                        frontier[v] = (g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) ? 1 : 0;
+                }
+        }
+        int num_items = std::count(frontier.begin(), frontier.end(), 1);
+        std::cerr << "returning with frontier size: " << num_items << std::endl;
+}
diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
deleted file mode 100644
index 14449a85d..000000000
--- a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//#define DEBUG
-#include <bsg_manycore.h>
-
-#ifdef DEBUG
-#define BSG_TILE_GROUP_X_DIM 1 
-#define BSG_TILE_GROUP_Y_DIM 1
-#define bsg_tiles_X BSG_TILE_GROUP_X_DIM 
-#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM 
-#else
-#include <bsg_set_tile_x_y.h>
-// #define BSG_TILE_GROUP_X_DIM 16 
-// #define BSG_TILE_GROUP_Y_DIM 8
-#endif
-
-#include <bsg_tile_group_barrier.hpp>
-bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
-
-#include <pr_nibble.hpp>
-#include <cstring>
-
-#ifdef DEBUG
-#define pr_dbg(fmt, ...)			\
-		bsg_printf(fmt, ##__VA_ARGS__)
-#else
-#define pr_dbg(fmt, ...)
-#endif
-
-__attribute__((section(".dram"))) float  * __restrict p;
-__attribute__((section(".dram"))) float  * __restrict old_rank;
-__attribute__((section(".dram"))) float  * __restrict new_rank;
-__attribute__((section(".dram"))) int  * __restrict out_degree;
-//__attribute__((section(".dram"))) int  * __restrict generated_tmp_vector_3;
-//__attribute__((section(".dram"))) double alpha = 0.15; 
-//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; 
-
-
-template <typename APPLY_FUNC > int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
-{
-  //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); 
-  //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V);
-  int start, end;
-  local_range(V, &start, &end);
-  for ( int d = start; d < end; d++) {
-    int degree = in_indices[d + 1] - in_indices[d];
-    int * neighbors = &in_neighbors[in_indices[d]];
-    for(int s = 0; s < degree; s++) { 
-      if(from_vertexset[neighbors[s]]) {
-        //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); 
-        apply_func (neighbors[s] , d);
-      }
-    } //end of loop on in neighbors
-  } //end of outer for loop
-  return 0;
-} //end of edgeset apply function 
-
-template <typename APPLY_FUNC > int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) 
-{
-  //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); 
-  //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V);
-  int start, end;
-  local_range(V, &start, &end);
-  for ( int s = start; s < end; s++) {
-    if(from_vertexset[s]) {
-      int degree = out_indices[s + 1] - out_indices[s];
-      int * neighbors = &out_neighbors[out_indices[s]];
-      for(int d = 0; d < degree; d++) { 
-        apply_func (s, neighbors[d]);
-	//if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); }
-	
-      }
-    } //end of loop on in neighbors
-  } //end of outer for loop
-  //barrier.sync();
-  return 0;
-} //end of edgeset apply function 
-
-struct updateEdge
-{
-  void operator() (int src, int dst)
-  {
-    float alpha = 0.15; 
-    new_rank[dst] = (new_rank[dst] + (((((1)  - alpha) / ((1)  + alpha)) * old_rank[src]) / out_degree[src]));
-  };
-};
-struct updateSelf
-{
-  void operator() (int v)
-  {
-    float alpha = 0.15; 
-    p[v] = (p[v] + ((((2)  * alpha) / ((1)  + alpha)) * old_rank[v]));
-    new_rank[v] = (0) ;
-  };
-};
-struct filter_frontier
-{
-  bool operator() (int v)
-  {
-    float epsilon = (float) 1e-6; 
-    bool output ;
-    //if(old_rank[v] == 0) return 0;
-    if(new_rank[v] == 0) return 0;
-    //output = (old_rank[v]) > ((out_degree[v] * epsilon));
-    output = (new_rank[v]) > ((out_degree[v] * epsilon));
-    return output;
-  };
-};
-
-extern "C" int  __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) {
-        //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-	barrier.sync();
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		if(frontier[iter_x]) {updateSelf()(iter_x);}
-	}
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
-	barrier.sync();
-        //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-  bsg_saif_start();
-	edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x);
-  bsg_saif_end();
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
- extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) {
-	barrier.sync(); 
-  bsg_cuda_print_stat_start(tag_c);
-  bsg_saif_start();
-	edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x);
-  bsg_saif_end();
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
-extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { 
-        //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c);
-        //pr_dbg("%i: on frontier filter %i\n", bsg_id, tag_c);
-  bsg_cuda_print_stat_start(tag_c);
-	barrier.sync();
-	int start, end;
-	local_range(V, &start, &end);
-	for (int iter_x = start; iter_x < end; iter_x++) {
-		if (iter_x < V) {
-			next5[iter_x] = 0;
-			if ( filter_frontier()( iter_x ) ) {
-				next5[iter_x] = 1;
-				//pr_dbg("added vertex %i to frontier\n", iter_x);
-			}
-     		}
-		else { break; }
-	} //end of loop
-  bsg_cuda_print_stat_end(tag_c);
-	barrier.sync();
-	return 0;
-}
-
-extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) {
-  	int id = __bsg_id;
-  	int threads = bsg_tiles_X * bsg_tiles_Y;
-    // prefetch all data;
-    for (int i = 32 * id; i < E; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (p[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i]));
-    }
-    for (int i = 32 * id; i < V; i += 32 * threads) {
-        asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i]));
-    }
-		barrier.sync();
-    return ;
-
-}
-
-
diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp
deleted file mode 100644
index 56dd9aaf6..000000000
--- a/examples/graphit/test_pr_nibble/main.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-#include "pr.hpp"
-
-//#define DEBUG
-
-#define VERIFY 0
-
-#ifdef DEBUG
-#define X 1 
-#define Y 1
-#else
-#define X 16 
-#define Y 8
-#endif
-
-#define ROOT 6 //eventually we will need to do 50 start vertices (in parallel)
-#define NUM_LOCKS 1024 //width of manycore * 64
-
-GraphHB edges; 
-GlobalScalar<hb_mc_eva_t> p_dev;
-GlobalScalar<hb_mc_eva_t> old_rank_dev;
-GlobalScalar<hb_mc_eva_t> new_rank_dev;
-GlobalScalar<hb_mc_eva_t> out_degree_dev;
-//GlobalScalar<float > alpha_dev;
-//GlobalScalar<float > epsilon_dev;
-
-#include "pr_host.hpp"
-
-int test_pr_nibble(int argc, char ** argv){
-  InputParser input(argc, argv);
-  if(!input.cmdOptionExists("-g")){
-    std::cerr << "no input args\n";
-    return 0;
-  }
-  std::string ucode_path = input.getRISCVFile();
-
-  int iter = 0;
-  std::string iterstrbase = "iteration-";
-  auto pos = ucode_path.find(iterstrbase);
-  auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos);
-  std::stringstream ss(iterstr);
-  ss >> iter;
-  std::cerr << "iteration: " << iter << std::endl;
-
-  int version = 0; //default to vertex pull
-  if(ucode_path.find("push") != std::string::npos) {
-    version = 1;
-  }
-  int hybrid = 0; //default to vertex pull
-  if(ucode_path.find("hybrid") != std::string::npos) {
-    hybrid = 1;
-  }
-  std::cerr << "version: " << version << std::endl;
-  std::cerr << "hybrid: " << hybrid << std::endl;
-  std::cerr << "load microcode" << std::endl;
-  hammerblade::builtin_loadMicroCodeFromFile(ucode_path);
-
-  std::cerr << "load graph" << std::endl;
-  std::string graph_f = input.getCmdOption("-g");
-  edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); 
-
-  std::cerr << "size of graph: " << std::endl;
-  std::cerr << edges.num_nodes() << std::endl;
-  std::cerr << edges.num_edges() << std::endl; 
-  std::cerr << "init global scalars" << std::endl; 
-
-  p_dev = GlobalScalar<hb_mc_eva_t>("p");
-  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), p_dev);
-  old_rank_dev = GlobalScalar<hb_mc_eva_t>("old_rank");
-  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), old_rank_dev);
-  new_rank_dev = GlobalScalar<hb_mc_eva_t>("new_rank");
-  hammerblade::init_global_array<float>(hammerblade::builtin_getVerticesHB(edges), new_rank_dev);
-  out_degree_dev = GlobalScalar<hb_mc_eva_t>("out_degree");
-  hammerblade::init_global_array<int32_t>(hammerblade::builtin_getVerticesHB(edges), out_degree_dev);
-  //alpha_dev = GlobalScalar<float>("alpha");
-  //epsilon_dev = GlobalScalar<float>("epsilon");
-  std::cerr << "init locks" << std::endl;
-  GlobalScalar<hb_mc_eva_t> glbl_locks = GlobalScalar<hb_mc_eva_t>("locks");
-  hammerblade::init_global_array<std::atomic<int>>(NUM_LOCKS, glbl_locks);
-  std::atomic<int> tmp_a[NUM_LOCKS] = {};
-  Device::Ptr device = Device::GetInstance();
-  float alpha = ((float) 0.15) ;
-  float epsilon = ((float) 1e-06) ;
-  int start_vertex = ROOT;
-  Vector<int32_t> frontier = Vector<int32_t>(hammerblade::builtin_getVerticesHB(edges));
-
-  std::vector<int32_t> hfrontier(edges.num_nodes(), 0);
-  std::vector<float> p(edges.num_nodes(), (float) 0.0);
-  std::vector<float> new_rank(edges.num_nodes(), (float) 0.0);
-  std::vector<float> old_rank(edges.num_nodes(), (float) 0.0);
-  std::vector<int32_t> out_degs = edges.get_out_degrees();
-
-  //compute up to current iter on host
-  hfrontier[start_vertex] = 1;
-  new_rank[start_vertex] = (float) 1.0;
-  old_rank[start_vertex] = (float) 1.0;
-  host_pr_calc(p, old_rank, new_rank, hfrontier, iter);
-
-  frontier.copyToDevice(hfrontier.data(), hfrontier.size());
-
-  //next_frontier.copyToDevice(zeros.data(), zeros.size());
-  hammerblade::write_global_buffer_dma<float>(p.data(), p_dev, p.size());  
-  hammerblade::write_global_buffer_dma<float>(old_rank.data(), old_rank_dev, old_rank.size());  
-  hammerblade::write_global_buffer_dma<float>(new_rank.data(), new_rank_dev, new_rank.size());  
-  hammerblade::write_global_buffer_dma<int32_t>(out_degs.data(), out_degree_dev, out_degs.size());  
-  hammerblade::write_global_buffer_dma<std::atomic<int>>(tmp_a, glbl_locks, NUM_LOCKS);
-
-  device->freeze_cores();
-  device->write_dma();
-  device->unfreeze_cores(); 
-  if(hybrid || version == 2) {
-    int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1);
-    int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges());
-    if(dir){ 
-      if(version != 2) version = 0; //pull
-    } else {
-      version = 1; //push
-    }
-  }
-
-  std::cerr << "start of while loop\n";
-  int tag_c = 0;
-  //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) 
-  for(int i = 0; i < 1; i++)
-  {
-    int f_sz = 0;
-    //new_rank = old_rank;
-    switch(version) {
-      case 0: //vertex pull
-	    std::cerr << "pull kernel\n";
-      std::cerr << "preloading the cache\n";
-      device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
-      device->runJobs();
-    	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
-    	device->runJobs();
-    	tag_c++;
- 			std::cerr << "run update edges kernel on iter : " << iter << "\n";
-      device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
-    	device->runJobs();
-      tag_c++;
-    	std::cerr << "create next frontier\n";
-    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
-    	device->runJobs();
-    	std::cerr << "swap arrays\n";
-    	hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
-      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
-      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
-      break;
-      case 1: //vertex push
-	    std::cerr << "push kernel\n";
-      std::cerr << "preloading the cache\n";
-      device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
-      device->runJobs();
-    	std::cerr << "run update self vertex kernel\n";
-    	device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
-    	device->runJobs();
-    	tag_c++;
-    	std::cerr << "run update edges kernel on iter : " << iter << "\n";
-      device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); 
-    	device->runJobs();
-      tag_c++;
-    	std::cerr << "create next frontier\n";
-    	device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
-    	device->runJobs();
-    	std::cerr << "swap arrays\n";
-   	  hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
-      f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
-      std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
-      break;
-    }
-    tag_c++;
-
-    iter++;
-  }
-  std::cerr << "*******end of program********\n";
-  std::cerr << "took: " << iter << " iterations to complete\n";
-  if(VERIFY) {
-    ofstream ver_file;
-    ver_file.open("./rank.txt");
-    float host_rank[edges.num_nodes()];
-    hammerblade::read_global_buffer_dma<float>(host_rank, old_rank_dev, edges.num_nodes());
-    for(int i = 0; i < edges.num_nodes(); i++) {
-      ver_file << host_rank[i] << std::endl;
-    }
-    ver_file.close();  
-  }
-  device->finish(); 
-  return 0;
-}
-
-declare_program_main("test_pr_nibble", test_pr_nibble); 
diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp
deleted file mode 100644
index 1923c6d6d..000000000
--- a/examples/graphit/test_pr_nibble/pr_host.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//function to compute pr-nibble on host up to current iter
-#pragma once
-#include <iostream>
-#include <fstream>
-
-inline void host_pr_calc(std::vector<float> & p, std::vector<float> & old_rank, std::vector<float> & new_rank, std::vector<int> & frontier, int iter) {
-    float alpha = (float) 0.15;
-    float epsilon = (float) 1e-06;
-    auto g = edges.getHostGraph();
-    int * in_neigh = g.in_neighbors_shared_.get();
-    int ** in_index = g.in_index_shared_.get();
-		std::string fname = "iter-" + std::to_string(iter) + ".txt";
-    ofstream ofile;
-    ofile.open (fname);
-    for(int i = 0; i < iter; i++) {
-        new_rank.assign(old_rank.begin(), old_rank.end());
-        //print out iteration and size:
-        int num_items = std::count(frontier.begin(), frontier.end(), 1);
-        std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl;
-        //update_self
-        for(int v = 0; v < g.num_nodes(); v++) {
-						if(frontier[v]) {
-            	p[v] += (2.0 * alpha) / (1.0  + alpha) * old_rank[v];
-            	new_rank[v] = (float) 0.0 ;
-						}
-        }
-        //update edges
-        for(int d = 0; d < g.num_nodes(); d++) {
-            for(int s : g.in_neigh(d)) {
-                if(frontier[s]){
-                    float update = ((1.0 - alpha) / (1.0  + alpha)) * old_rank[s];
-										update = update / ((float) g.out_degree(s));
-										new_rank[d] += update;
-										if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;}
-                }
-            }
-        }
-        old_rank.assign(new_rank.begin(), new_rank.end());
-        //update frontier
-        for(int v = 0; v < g.num_nodes(); v++) {
-            frontier[v] = 0;
-            if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) {
-                frontier[v] = 1;
-            }
-        }
-    }
-		ofile.close();
-    int num_items = std::count(frontier.begin(), frontier.end(), 1);
-    std::cerr << "returning with frontier size: " << num_items << std::endl;
-}

From 15278cf7d798dc06b65afb5131b1bd84a2ce7c5a Mon Sep 17 00:00:00 2001
From: Emily Furst <eafurst@cs.washington.edu>
Date: Mon, 10 May 2021 14:16:02 -0700
Subject: [PATCH 22/22] [pr-nibble] fixing bad makefile path, vim trampled all
 over my camel case

---
 examples/graphit/pr_nibble/Makefile |  2 +-
 examples/graphit/pr_nibble/main.cpp | 38 ++++++++++++++---------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/examples/graphit/pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile
index 720bbdc24..af6475765 100644
--- a/examples/graphit/pr_nibble/Makefile
+++ b/examples/graphit/pr_nibble/Makefile
@@ -128,7 +128,7 @@ TILE_GROUP_DIM_Y = 8
 RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
 RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
 
-RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
+RISCV_INCLUDES += -I$(CURRENT_PATH)/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/
 
 include $(EXAMPLES_PATH)/cuda/riscv.mk
 
diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp
index 55a05fd8e..aa2d4032f 100644
--- a/examples/graphit/pr_nibble/main.cpp
+++ b/examples/graphit/pr_nibble/main.cpp
@@ -120,47 +120,47 @@ int test_pr_nibble(int argc, char ** argv){
         case 0: //vertex pull
             std::cerr << "pull kernel\n";
             std::cerr << "preloading the cache\n";
-            device->enqueuejob("prefetch", hb_mc_dimension(x,y),{edges.getinindicesaddr() , edges.getinneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges()});
-            device->runjobs();
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
             std::cerr << "run update self vertex kernel\n";
-            device->enqueuejob("updateself_kernel",hb_mc_dimension(x,y), {frontier.getaddr(), edges.num_nodes(), tag_c});
-            device->runjobs();
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
+            device->runJobs();
             tag_c++;
             std::cerr << "run update edges kernel on iter : " << iter << "\n";
-            device->enqueuejob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(x,y),{edges.getinindicesaddr() , edges.getinneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
-            device->runjobs();
+            device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c});
+            device->runJobs();
             tag_c++;
             std::cerr << "create next frontier\n";
-            device->enqueuejob("filter_frontier_where_call", hb_mc_dimension(x,y),{frontier.getaddr(), edges.num_nodes(), edges.num_edges(), tag_c});
-            device->runjobs();
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
             std::cerr << "swap arrays\n";
             hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
-            f_sz = builtin_getvertexsetsizehb(frontier, edges.num_nodes());
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
             std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
             break;
         case 1: //vertex push
             std::cerr << "push kernel\n";
             std::cerr << "preloading the cache\n";
-            device->enqueuejob("prefetch", hb_mc_dimension(x,y),{edges.getoutindicesaddr() , edges.getoutneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges()});
-            device->runjobs();
+            device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()});
+            device->runJobs();
             std::cerr << "run update self vertex kernel\n";
-            device->enqueuejob("updateself_kernel",hb_mc_dimension(x,y), {frontier.getaddr(), edges.num_nodes(), tag_c});
-            device->runjobs();
+            device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c});
+            device->runJobs();
             tag_c++;
             std::cerr << "run update edges kernel on iter : " << iter << "\n";
-            device->enqueuejob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(x,y),{edges.getoutindicesaddr() , edges.getoutneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); 
-            device->runjobs();
+            device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); 
+            device->runJobs();
             tag_c++;
             std::cerr << "create next frontier\n";
-            device->enqueuejob("filter_frontier_where_call", hb_mc_dimension(x,y),{frontier.getaddr(), edges.num_nodes(), edges.num_edges(), tag_c});
-            device->runjobs();
+            device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c});
+            device->runJobs();
             std::cerr << "swap arrays\n";
             hammerblade::swap_global_arrays<float>(new_rank_dev, old_rank_dev);
-            f_sz = builtin_getvertexsetsizehb(frontier, edges.num_nodes());
+            f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes());
             std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl;
             break;
     }
-    if(verify) {
+    if(VERIFY) {
         ofstream ver_file;
         ver_file.open("./rank.txt");
         float host_rank[edges.num_nodes()];