From 11664ab6ab8300feeaa23cc43a4e9ced48d2fafc Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 11:25:13 -0700 Subject: [PATCH 01/22] IPNSW source from baseline --- .gitmodules | 3 + .../ipnsw/BeamSearchFactory.hpp | 11 + .../ipnsw/BeamSearchKernelRunner.hpp | 38 ++ .../ipnsw/BeamSearchResultReader.hpp | 25 + .../ipnsw/GreedyWalkFactory.hpp | 12 + .../ipnsw/GreedyWalkKernelRunner.hpp | 25 + .../ipnsw/GreedyWalkResultReader.hpp | 21 + .../ipnsw/GreedyWalkResults.cpp | 517 ++++++++++++++++++ .../ipnsw/GreedyWalkResults.hpp | 9 + examples/sdh-eval-workloads/ipnsw/IO.hpp | 223 ++++++++ .../sdh-eval-workloads/ipnsw/IPNSWFactory.hpp | 17 + .../sdh-eval-workloads/ipnsw/IPNSWGraph.hpp | 69 +++ .../ipnsw/IPNSWKernelRunner.hpp | 31 ++ .../ipnsw/IPNSWResultReader.hpp | 13 + .../sdh-eval-workloads/ipnsw/IPNSWRunner.hpp | 184 +++++++ .../ipnsw/IProductUBmkFactory.hpp | 19 + .../ipnsw/IProductUBmkKernelRunner.hpp | 31 ++ .../ipnsw/IProductUBmkResultReader.hpp | 12 + examples/sdh-eval-workloads/ipnsw/Makefile | 351 ++++++++++++ .../ipnsw/StringHelpers.hpp | 17 + .../sdh-eval-workloads/ipnsw/hb-prog-eval | 1 + examples/sdh-eval-workloads/ipnsw/ipnsw.cpp | 86 +++ examples/sdh-eval-workloads/ipnsw/ipnsw.hpp | 37 ++ .../ipnsw/kernel/beam_search/kernel.cpp | 182 ++++++ .../ipnsw/kernel/beam_search_v1/kernel.cpp | 188 +++++++ .../ipnsw/kernel/beam_search_v2/kernel.cpp | 189 +++++++ .../ipnsw/kernel/beam_search_v3/kernel.cpp | 189 +++++++ .../ipnsw/kernel/beam_search_v4/kernel.cpp | 189 +++++++ .../ipnsw/kernel/beam_search_v5/kernel.cpp | 189 +++++++ .../ipnsw/kernel/debug/kernel.cpp | 2 + .../ipnsw/kernel/greedy_walk/kernel.cpp | 147 +++++ .../ipnsw/kernel/greedy_walk_v1/kernel.cpp | 147 +++++ .../ipnsw/kernel/greedy_walk_v2/kernel.cpp | 147 +++++ .../ipnsw/kernel/greedy_walk_v3/kernel.cpp | 147 +++++ .../ipnsw/kernel/include/heap.hpp | 40 ++ .../ipnsw/kernel/include/hello_world.hpp | 6 + .../ipnsw/kernel/include/inner_product.hpp | 89 +++ 
.../ipnsw/kernel/include/set.hpp | 73 +++ .../ipnsw/kernel/iproduct_ubmk/kernel.cpp | 71 +++ .../ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp | 76 +++ .../ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp | 76 +++ .../ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp | 76 +++ .../ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp | 76 +++ 43 files changed, 4051 insertions(+) create mode 100644 .gitmodules create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IO.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/Makefile create mode 100644 examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp create mode 160000 examples/sdh-eval-workloads/ipnsw/hb-prog-eval create mode 100644 examples/sdh-eval-workloads/ipnsw/ipnsw.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/ipnsw.hpp create mode 100644 
examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..5083eb4ea --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "n"] + path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval + url = git@github.com:bespoke-silicon-group/hb-prog-eval diff --git 
a/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp new file mode 100644 index 000000000..3d14b2c8d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchFactory.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "BeamSearchKernelRunner.hpp" +#include "BeamSearchResultReader.hpp" +namespace ipnsw { + class BeamSearchFactory : public IPNSWFactory { /* Factory specialization whose two private hooks hand back the beam-search kernel runner and the beam-search result reader. */ + private: + IPNSWKernelRunner *_KernelRunner() const { return new BeamSearchKernelRunner; } /* Heap-allocates the runner with plain new; this class never deletes it, so ownership presumably passes to the caller - confirm against IPNSWFactory. */ + IPNSWResultReader *_ResultReader() const { return new BeamSearchResultReader; } /* Same allocation pattern as _KernelRunner, for the result reader. */ + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp new file mode 100644 index 000000000..426042f6d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp @@ -0,0 +1,38 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" +#include "GreedyWalkResults.hpp" + +namespace ipnsw { + class BeamSearchKernelRunner : public IPNSWKernelRunner { /* Kernel runner for beam search: supplies the kernel name, the kernel argument list, and two Dim values. */ + std::string kernelName(const IPNSWRunner & runner) const { /* Returns the fixed kernel name string; the runner parameter is not used. */ + return "ipnsw_beam_search"; + } + + std::vector argv(const IPNSWRunner & runner) const { /* Loads v_curr and d_curr from the GREEDY_WALK_RESULTS entry at index IPNSWRunner::QUERY, writes both through the HammerBlade handle to runner.v_curr_dev() and runner.d_curr_dev(), then returns the nine-entry argument list. NOTE(review): the index is not range-checked here, and the two std::get calls and std::vector carry no template arguments in this copy of the patch - they look lost in extraction; confirm against the repository. */ + int v_curr; + float d_curr; + v_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + d_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + + HammerBlade::Ptr hb = HammerBlade::Get(); + hb->write(runner.v_curr_dev(), &v_curr, sizeof(v_curr)); + hb->write(runner.d_curr_dev(), &d_curr, sizeof(d_curr)); + + std::vector argv = { /* Entry order is fixed by this list; presumably it must match the kernel parameter order - confirm against the kernel source. */ + runner.graph_metadata_dev(), + runner.db_dev(), + runner.query_dev(), + runner.seen_dev(), + runner.v_curr_dev(), + runner.d_curr_dev(), + runner.candidates_dev(), + runner.results_dev(), + runner.n_results_dev(), + }; + return argv; + }; + Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);} /* Always Dim(1,1); presumably the grid dimension - confirm against IPNSWKernelRunner. */ + Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);} /* Always Dim(1,1); presumably the tile-group dimension - confirm against IPNSWKernelRunner. */ + }; +} diff --git
a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp new file mode 100644 index 000000000..ce77d324f --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp @@ -0,0 +1,25 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" +#include "GreedyWalkResults.hpp" + +namespace ipnsw { + class BeamSearchResultReader : public IPNSWResultReader { /* Result reader for beam search: fetches the result count and the result records through the HammerBlade handle and prints them to std::cout. */ + public: + void readResults(const IPNSWRunner & runner) { /* Reads an int count from runner.n_results_dev(), then reads count records of sizeof(GreedyWalkResult) bytes from runner.results_dev() and prints each one as {element 0,element 1}. */ + HammerBlade::Ptr hb = HammerBlade::Get(); + + int n_results; + hb->read(runner.n_results_dev(), &n_results, sizeof(int)); + + std::vector results(n_results); /* NOTE(review): n_results is used unchecked; a zero or negative value reaches this constructor and the element-zero address taken on the next line - confirm the count is always positive. The vector element type looks lost in extraction in this copy of the patch. */ + hb->push_read(runner.results_dev(), &results[0], n_results * sizeof(GreedyWalkResult)); + hb->sync_read(); /* push_read followed by sync_read; presumably the first queues the transfer and the second waits for it - confirm against the HammerBlade API. */ + + std::cout << "Beam search:" << std::endl; + for (auto & r : results) { + std::cout << "{" << std::get<0>(r) << "," << std::get<1>(r) << "}" << std::endl; + } + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp new file mode 100644 index 000000000..e98f11ad2 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkFactory.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "GreedyWalkKernelRunner.hpp" +#include "GreedyWalkResultReader.hpp" + +namespace ipnsw { + class GreedyWalkFactory : public IPNSWFactory { /* Factory specialization whose two private hooks hand back the greedy-walk kernel runner and the greedy-walk result reader. */ + private: + IPNSWKernelRunner *_KernelRunner() const { return new GreedyWalkKernelRunner; } /* Heap-allocates the runner with plain new; this class never deletes it, so ownership presumably passes to the caller - confirm against IPNSWFactory. */ + IPNSWResultReader *_ResultReader() const { return new GreedyWalkResultReader; } /* Same allocation pattern as _KernelRunner, for the result reader. */ + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp new file mode 100644 index 000000000..72eea9f0f --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp @@ -0,0 +1,25 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" + +namespace ipnsw { + class
GreedyWalkKernelRunner : public IPNSWKernelRunner { /* Kernel runner for the greedy walk: supplies the kernel name, the kernel argument list, and two Dim values. */ + std::string kernelName(const IPNSWRunner & runner) const { /* Returns the fixed kernel name string; the runner parameter is not used. */ + return "ipnsw_greedy_search"; + } + + std::vector argv(const IPNSWRunner & runner) const { /* Returns the six-entry argument list built from the runner accessors; unlike the beam-search runner it writes nothing through HammerBlade first. The std::vector element type looks lost in extraction in this copy of the patch. */ + std::vector argv = { + runner.graph_metadata_dev(), + runner.db_dev(), + runner.query_dev(), + runner.seen_dev(), + runner.v_curr_dev(), + runner.d_curr_dev(), + }; + return argv; + }; + Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);} /* Always Dim(1,1); presumably the grid dimension - confirm against IPNSWKernelRunner. */ + Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);} /* Always Dim(1,1); presumably the tile-group dimension - confirm against IPNSWKernelRunner. */ + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp new file mode 100644 index 000000000..ae57cd548 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp @@ -0,0 +1,21 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" + +namespace ipnsw { + class GreedyWalkResultReader : public IPNSWResultReader { /* Result reader for the greedy walk: fetches v_curr and d_curr through the HammerBlade handle and prints them to std::cout. */ + public: + void readResults(const IPNSWRunner & runner) { /* Reads one int from runner.v_curr_dev() and one float from runner.d_curr_dev(), then prints them as (v_curr,d_curr) after a fixed label. */ + HammerBlade::Ptr hb = HammerBlade::Get(); + int v_curr; + float d_curr; + + hb->read(runner.v_curr_dev(), &v_curr, sizeof(int)); + hb->read(runner.d_curr_dev(), &d_curr, sizeof(float)); + + std::cout << "Greedy walk (v_curr,d_curr) = " + << "(" << v_curr << "," << d_curr << ")" + << std::endl; + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp new file mode 100644 index 000000000..7d37104df --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.cpp @@ -0,0 +1,517 @@ +#include "GreedyWalkResults.hpp" +namespace ipnsw { + std::vector GREEDY_WALK_RESULTS = { + GreedyWalkResult(static_cast(-0x1.94442e0000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.e72901fffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.cb85360000001p-4),541780), + GreedyWalkResult(static_cast(-0x1.e56d7ffffffffp-8), 78517), +
GreedyWalkResult(static_cast(-0x1.655f860000000p-4),732469), + GreedyWalkResult(static_cast(-0x1.04cbcc0000000p-4),380912), + GreedyWalkResult(static_cast(-0x1.3243d20000000p-5),606365), + GreedyWalkResult(static_cast(-0x1.2dbf640000000p-4),950108), + GreedyWalkResult(static_cast(-0x1.fa90ea0000001p-1),168533), + GreedyWalkResult(static_cast(-0x1.2922f80000000p-3),228514), + GreedyWalkResult(static_cast(-0x1.5974060000000p-1),725033), + GreedyWalkResult(static_cast(-0x1.abcf2c0000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.b262380000000p-1),272753), + GreedyWalkResult(static_cast(-0x1.c0e98a0000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.01b4680000000p-2),184077), + GreedyWalkResult(static_cast(-0x1.96e3280000000p-2),208965), + GreedyWalkResult(static_cast(-0x1.58dd120000000p-3),580161), + GreedyWalkResult(static_cast(-0x1.1f333a0000000p-3),236872), + GreedyWalkResult(static_cast(-0x1.8db7de0000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.4e43500000000p-2),909721), + GreedyWalkResult(static_cast(-0x1.a5ae760000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.7fcff00000000p-5),294738), + GreedyWalkResult(static_cast(-0x1.5630f40000000p-1),960530), + GreedyWalkResult(static_cast(-0x1.48d8c20000000p-1),853984), + GreedyWalkResult(static_cast(-0x1.14556ffffffffp+0),909721), + GreedyWalkResult(static_cast(-0x1.a746760000000p-2),865184), + GreedyWalkResult(static_cast(-0x1.ddcb81fffffffp-3),513240), + GreedyWalkResult(static_cast(-0x1.94a92ffffffffp-2),550771), + GreedyWalkResult(static_cast(-0x1.45b69c0000000p-1),432335), + GreedyWalkResult(static_cast(-0x1.2ef8fa0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.9909440000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.1ce937fffffffp-5),321516), + GreedyWalkResult(static_cast(-0x1.c0de380000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.0de8e60000000p-8),897966), + GreedyWalkResult(static_cast(-0x1.99783c0000000p-1),865184), + 
GreedyWalkResult(static_cast(-0x1.01316e0000000p+0),886263), + GreedyWalkResult(static_cast(-0x1.a172140000000p-6),177485), + GreedyWalkResult(static_cast(-0x1.2b8f9a0000000p-7),973080), + GreedyWalkResult(static_cast(-0x1.924b440000000p-5),290055), + GreedyWalkResult(static_cast(-0x1.8515aa0000000p-2),905210), + GreedyWalkResult(static_cast(-0x1.f68975ffffffep-3),294738), + GreedyWalkResult(static_cast(-0x1.dd5ed00000001p-6),790506), + GreedyWalkResult(static_cast(-0x1.be40740000000p-1),870888), + GreedyWalkResult(static_cast(-0x1.08f4460000001p-2),666073), + GreedyWalkResult(static_cast(-0x1.2589100000000p-2),385014), + GreedyWalkResult(static_cast(-0x1.e43ad00000001p-3),230001), + GreedyWalkResult(static_cast(-0x1.161b360000000p+0),646867), + GreedyWalkResult(static_cast(-0x1.475e87fffffffp-6),179303), + GreedyWalkResult(static_cast(-0x1.425b1c0000000p-1),463324), + GreedyWalkResult(static_cast(-0x1.f4b68c0000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.1333440000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.0e35aa0000000p-1),312088), + GreedyWalkResult(static_cast(-0x1.1b7653fffffffp+0),854962), + GreedyWalkResult(static_cast(-0x1.cb8adc0000000p-3),491377), + GreedyWalkResult(static_cast(-0x1.51a0380000000p-1),226268), + GreedyWalkResult(static_cast(-0x1.e4b9940000000p-2),603696), + GreedyWalkResult(static_cast(-0x1.623f9a0000000p-2),991097), + GreedyWalkResult(static_cast(-0x1.1660b20000000p-1), 18868), + GreedyWalkResult(static_cast(-0x1.bd75200000000p-7), 56131), + GreedyWalkResult(static_cast(-0x1.4dbbe00000000p+0), 16476), + GreedyWalkResult(static_cast(-0x1.1b55860000000p-5),310512), + GreedyWalkResult(static_cast(-0x1.1f40e00000000p+0),115894), + GreedyWalkResult(static_cast(-0x1.d403c60000001p-2),718485), + GreedyWalkResult(static_cast(-0x1.a7b7bdfffffffp-7),601673), + GreedyWalkResult(static_cast(-0x1.7f5c8c0000000p-2),552153), + GreedyWalkResult(static_cast(-0x1.6834060000001p-3),294738), + 
GreedyWalkResult(static_cast(-0x1.8ccf620000000p-2),513240), + GreedyWalkResult(static_cast(-0x1.1508660000000p+0),666073), + GreedyWalkResult(static_cast(-0x1.6362300000000p-1),982683), + GreedyWalkResult(static_cast(-0x1.175fbc0000000p-4),226268), + GreedyWalkResult(static_cast(-0x1.10e30a0000000p-5),703851), + GreedyWalkResult(static_cast(-0x1.0343340000000p+0),580161), + GreedyWalkResult(static_cast(-0x1.9337a20000000p-3),236872), + GreedyWalkResult(static_cast(-0x1.986e8a0000000p-7),986292), + GreedyWalkResult(static_cast(-0x1.1f400a0000000p+0),336830), + GreedyWalkResult(static_cast(-0x1.3c0e060000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.8589cc0000000p-1),118607), + GreedyWalkResult(static_cast(-0x1.745f000000000p-3),272753), + GreedyWalkResult(static_cast(-0x1.317ca40000000p-4),494402), + GreedyWalkResult(static_cast(-0x1.ebd52a0000001p-7),517512), + GreedyWalkResult(static_cast(-0x1.7ad9100000001p-6),986292), + GreedyWalkResult(static_cast(-0x1.6ed8a00000000p-2),134880), + GreedyWalkResult(static_cast(-0x1.273edc0000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.93db8c0000000p-1),620143), + GreedyWalkResult(static_cast(-0x1.324dd60000000p-4),778172), + GreedyWalkResult(static_cast(-0x1.3c59a80000000p-1),270175), + GreedyWalkResult(static_cast(-0x1.fc51e80000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.a7fbc60000000p-2),603696), + GreedyWalkResult(static_cast(-0x1.ab76780000000p-1),406402), + GreedyWalkResult(static_cast(-0x1.8733320000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.447bb00000000p-1),513240), + GreedyWalkResult(static_cast(-0x1.b5c3140000000p-5),729175), + GreedyWalkResult(static_cast(-0x1.ca9b880000000p-1),785859), + GreedyWalkResult(static_cast(-0x1.beee640000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.47b4e80000000p-1),738101), + GreedyWalkResult(static_cast(-0x1.069a7c0000000p-1),193430), + GreedyWalkResult(static_cast(-0x1.20f53c0000000p-1),118809), + 
GreedyWalkResult(static_cast(-0x1.1612f80000000p-2),711979), + GreedyWalkResult(static_cast(-0x1.25c6c80000000p-1),348136), + GreedyWalkResult(static_cast(-0x1.2507300000000p-2), 36731), + GreedyWalkResult(static_cast(-0x1.14ef720000001p+0),268974), + GreedyWalkResult(static_cast(-0x1.2b54f80000000p-4), 40323), + GreedyWalkResult(static_cast(-0x1.e07ccbfffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.070d960000000p-4),785239), + GreedyWalkResult(static_cast(-0x1.49e6200000000p-1),496330), + GreedyWalkResult(static_cast(-0x1.86c9080000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.0b584c0000000p-1),587902), + GreedyWalkResult(static_cast(-0x1.bb1ee00000000p-7),439426), + GreedyWalkResult(static_cast(-0x1.ff17c9fffffffp-11),467026), + GreedyWalkResult(static_cast(-0x1.0da6980000000p+0),294738), + GreedyWalkResult(static_cast(-0x1.1d0ba40000001p-3),288912), + GreedyWalkResult(static_cast(-0x1.301dec0000000p-1),541780), + GreedyWalkResult(static_cast(-0x1.2f9b800000000p-4),261103), + GreedyWalkResult(static_cast(-0x1.8d769e0000000p-4),239334), + GreedyWalkResult(static_cast(-0x1.6ea4f80000000p-3),223977), + GreedyWalkResult(static_cast(-0x1.fcc7dc0000000p-2),662137), + GreedyWalkResult(static_cast(-0x1.5949fe0000000p-3),565830), + GreedyWalkResult(static_cast(-0x1.1a11aa0000000p-1),908217), + GreedyWalkResult(static_cast(-0x1.8bff140000000p-1), 2251), + GreedyWalkResult(static_cast(-0x1.7ccda1fffffffp-2),467026), + GreedyWalkResult(static_cast(-0x1.80bf6e0000000p-2), 50016), + GreedyWalkResult(static_cast(-0x1.3444300000000p-2), 2251), + GreedyWalkResult(static_cast(-0x1.c8e8bc0000000p-1),223249), + GreedyWalkResult(static_cast(-0x1.679767fffffffp-3),494887), + GreedyWalkResult(static_cast(-0x1.6c896c0000000p-3),114191), + GreedyWalkResult(static_cast(-0x1.413b740000000p-4),772422), + GreedyWalkResult(static_cast(-0x1.4e1d760000000p-3),168533), + GreedyWalkResult(static_cast(-0x1.7202fe0000001p-1),131611), + 
GreedyWalkResult(static_cast(-0x1.2589840000000p+0),385014), + GreedyWalkResult(static_cast(-0x1.5820da0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.96ceb00000001p-3),177485), + GreedyWalkResult(static_cast(-0x1.d6ac77fffffffp-4),865184), + GreedyWalkResult(static_cast(-0x1.bfefa00000000p-7),149329), + GreedyWalkResult(static_cast(-0x1.69ac280000000p-1), 73867), + GreedyWalkResult(static_cast(-0x1.04bb900000000p+0),567514), + GreedyWalkResult(static_cast(-0x1.142a3dfffffffp+0),550771), + GreedyWalkResult(static_cast(-0x1.2f1ca40000000p-5),552153), + GreedyWalkResult(static_cast(-0x1.1def580000000p-1),679881), + GreedyWalkResult(static_cast(-0x1.072ac60000000p-4), 29163), + GreedyWalkResult(static_cast(-0x1.2821940000000p-4),854962), + GreedyWalkResult(static_cast(-0x1.72a68e0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.cafce80000000p-3),729852), + GreedyWalkResult(static_cast(-0x1.3ba2d80000000p-2),729021), + GreedyWalkResult(static_cast(-0x1.68739e0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.aeb25c0000000p-1),134880), + GreedyWalkResult(static_cast(-0x1.18c0840000000p-5),693842), + GreedyWalkResult(static_cast(-0x1.fe21ce0000001p-1), 40323), + GreedyWalkResult(static_cast(-0x1.b41fb00000001p-1),735181), + GreedyWalkResult(static_cast(-0x1.2826320000000p-8),379502), + GreedyWalkResult(static_cast(-0x1.5eecda0000000p-1),925333), + GreedyWalkResult(static_cast(-0x1.b002d40000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.4e53aa0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.a1b49bfffffffp-2),228514), + GreedyWalkResult(static_cast(-0x1.f1c7ac0000000p-1),750819), + GreedyWalkResult(static_cast(-0x1.67f6720000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.31a6600000001p-6),341861), + GreedyWalkResult(static_cast(-0x1.61c1080000000p-3),790506), + GreedyWalkResult(static_cast(-0x1.aaa3780000000p-2),550771), + GreedyWalkResult(static_cast(-0x1.3fa68a0000001p-6),160291), + 
GreedyWalkResult(static_cast(-0x1.38c0b20000000p-1),379199), + GreedyWalkResult(static_cast(-0x1.ee68980000001p-2),318485), + GreedyWalkResult(static_cast(-0x1.dd852c0000001p-2),655315), + GreedyWalkResult(static_cast(-0x1.06fa43fffffffp+0),790506), + GreedyWalkResult(static_cast(-0x1.07007e0000000p+0),926790), + GreedyWalkResult(static_cast(-0x1.f352a1fffffffp-1),523435), + GreedyWalkResult(static_cast(-0x1.c6d6160000000p-1),169991), + GreedyWalkResult(static_cast(-0x1.090c620000000p-5),168533), + GreedyWalkResult(static_cast(-0x1.19f6860000000p+0),239334), + GreedyWalkResult(static_cast(-0x1.e3f8580000001p-2),255916), + GreedyWalkResult(static_cast(-0x1.2148180000000p-1),206826), + GreedyWalkResult(static_cast(-0x1.0487660000000p-2),494402), + GreedyWalkResult(static_cast(-0x1.be5ea00000000p-3),532480), + GreedyWalkResult(static_cast(-0x1.114b0a0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.1e0a2a0000000p-7),379350), + GreedyWalkResult(static_cast(-0x1.22f06bfffffffp+0),239334), + GreedyWalkResult(static_cast(-0x1.bc42c20000000p-1),133288), + GreedyWalkResult(static_cast(-0x1.9ec387fffffffp-2),495101), + GreedyWalkResult(static_cast(-0x1.ab66b80000000p-3),115894), + GreedyWalkResult(static_cast(-0x1.9be6e80000000p-4),513240), + GreedyWalkResult(static_cast(-0x1.4cdc7ffffffffp-6),973080), + GreedyWalkResult(static_cast(-0x1.c7a31c0000000p-7),764589), + GreedyWalkResult(static_cast(-0x1.a35f1c0000000p-8),115043), + GreedyWalkResult(static_cast(-0x1.3422a00000000p-1),228514), + GreedyWalkResult(static_cast(-0x1.5a4aa60000000p-4), 49557), + GreedyWalkResult(static_cast(-0x1.06eddc0000000p-2),226268), + GreedyWalkResult(static_cast(-0x1.d46bde0000000p-1),790506), + GreedyWalkResult(static_cast(-0x1.02e72c0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.e33abffffffffp-2),112248), + GreedyWalkResult(static_cast(-0x1.ae74060000001p-4),133288), + GreedyWalkResult(static_cast(-0x1.272a2bfffffffp-7),850826), + 
GreedyWalkResult(static_cast(-0x1.357f25fffffffp-2),239334), + GreedyWalkResult(static_cast(-0x1.33c9f1fffffffp-3), 25893), + GreedyWalkResult(static_cast(-0x1.771fdc0000001p-5),305162), + GreedyWalkResult(static_cast(-0x1.18a1080000000p-4),729175), + GreedyWalkResult(static_cast(-0x1.46ad1e0000000p-4),790506), + GreedyWalkResult(static_cast(-0x1.0a53300000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.783f4e0000000p-6),546811), + GreedyWalkResult(static_cast(-0x1.3f05b60000000p-3),239334), + GreedyWalkResult(static_cast(-0x1.602d5c0000000p-3),463324), + GreedyWalkResult(static_cast(-0x1.c8f2b20000000p-5),513240), + GreedyWalkResult(static_cast(-0x1.0bde920000000p+0),236872), + GreedyWalkResult(static_cast(-0x1.8eb3fe0000000p-1),168533), + GreedyWalkResult(static_cast(-0x1.d981120000002p-3),849285), + GreedyWalkResult(static_cast(-0x1.d8151a0000001p-1),133288), + GreedyWalkResult(static_cast(-0x1.c231ec0000000p-1),790506), + GreedyWalkResult(static_cast(-0x1.c742700000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.2a6c6a0000000p+0),945767), + GreedyWalkResult(static_cast(-0x1.5b8c5bfffffffp-2),294738), + GreedyWalkResult(static_cast(-0x1.391a700000000p-12),562015), + GreedyWalkResult(static_cast(-0x1.896b960000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.28e7fe0000000p-3),228514), + GreedyWalkResult(static_cast(-0x1.577a11fffffffp-4),348136), + GreedyWalkResult(static_cast(-0x1.43b7f80000000p-4),950108), + GreedyWalkResult(static_cast(-0x1.7e64600000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.97ebe20000000p-5),392823), + GreedyWalkResult(static_cast(-0x1.a856440000000p-3),793084), + GreedyWalkResult(static_cast(-0x1.84531a0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.7c80d40000000p-4),186838), + GreedyWalkResult(static_cast(-0x1.0c56e5fffffffp+0),294738), + GreedyWalkResult(static_cast(-0x1.72c0da0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.1844d00000000p-5),606365), + 
GreedyWalkResult(static_cast(-0x1.52a5d40000000p-10),470059), + GreedyWalkResult(static_cast(-0x1.7d31400000000p-1),738101), + GreedyWalkResult(static_cast(-0x1.c47df00000000p-7),710471), + GreedyWalkResult(static_cast(-0x1.dc3ccbfffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.5e773c0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.7ffd660000000p-2),920345), + GreedyWalkResult(static_cast(-0x1.ab0dc00000001p-2),677155), + GreedyWalkResult(static_cast(-0x1.7f8db00000000p-5),973080), + GreedyWalkResult(static_cast(-0x1.add3b60000000p-1),293302), + GreedyWalkResult(static_cast(-0x1.e0328c0000000p-4),758625), + GreedyWalkResult(static_cast(-0x1.6022ce0000000p-5),666073), + GreedyWalkResult(static_cast(-0x1.a1d241fffffffp-4),226268), + GreedyWalkResult(static_cast(-0x1.cec5e60000000p-2),294738), + GreedyWalkResult(static_cast(-0x1.893f260000000p-3),855760), + GreedyWalkResult(static_cast(-0x1.0790c00000000p-2),145893), + GreedyWalkResult(static_cast(-0x1.49456ffffffffp-7),215955), + GreedyWalkResult(static_cast(-0x1.71b1bc0000001p-5),312088), + GreedyWalkResult(static_cast(-0x1.8b1c580000000p-1),729175), + GreedyWalkResult(static_cast(-0x1.2010d20000000p-4),142436), + GreedyWalkResult(static_cast(-0x1.c33ecc0000000p-4),280878), + GreedyWalkResult(static_cast(-0x1.6b1dce0000000p-2),444780), + GreedyWalkResult(static_cast(-0x1.f76bb60000001p-2),294738), + GreedyWalkResult(static_cast(-0x1.87151ffffffffp-2),294738), + GreedyWalkResult(static_cast(-0x1.f522ae0000000p-2), 9333), + GreedyWalkResult(static_cast(-0x1.77d5c40000001p-4),114191), + GreedyWalkResult(static_cast(-0x1.f7f4edfffffffp-5),239334), + GreedyWalkResult(static_cast(-0x1.1c46b00000000p-1),270226), + GreedyWalkResult(static_cast(-0x1.a4f43bfffffffp-6),140906), + GreedyWalkResult(static_cast(-0x1.8952480000000p-1),670146), + GreedyWalkResult(static_cast(-0x1.ca891c0000000p-7),973080), + GreedyWalkResult(static_cast(-0x1.e36b85fffffffp-1),294738), + 
GreedyWalkResult(static_cast(-0x1.1aaf580000000p-3),909372), + GreedyWalkResult(static_cast(-0x1.8116920000000p-8), 51434), + GreedyWalkResult(static_cast(-0x1.acc07e0000000p-1), 26012), + GreedyWalkResult(static_cast(-0x1.a2316c0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.3a68660000000p-3),628152), + GreedyWalkResult(static_cast(-0x1.c199e80000000p-2),907223), + GreedyWalkResult(static_cast(-0x1.8bfc920000000p-3), 16476), + GreedyWalkResult(static_cast(-0x1.c9b8520000000p-5),568921), + GreedyWalkResult(static_cast(-0x1.be82e20000000p-2),134880), + GreedyWalkResult(static_cast(-0x1.8cabe60000001p-2),660609), + GreedyWalkResult(static_cast(-0x1.7222980000000p-1),118809), + GreedyWalkResult(static_cast(-0x1.b313ea0000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.8b56380000000p-7), 38538), + GreedyWalkResult(static_cast(-0x1.3e74440000000p-3),729175), + GreedyWalkResult(static_cast(-0x1.6349900000000p-9),136557), + GreedyWalkResult(static_cast(-0x1.2128060000001p+0),672634), + GreedyWalkResult(static_cast(-0x1.25d0560000001p-8),314066), + GreedyWalkResult(static_cast(-0x1.206c1a0000000p+0),288181), + GreedyWalkResult(static_cast(-0x1.696a200000001p-3),114191), + GreedyWalkResult(static_cast(-0x1.1a74180000000p-1),226268), + GreedyWalkResult(static_cast(-0x1.608e8a0000000p-2),239334), + GreedyWalkResult(static_cast(-0x1.e583780000001p-1),854962), + GreedyWalkResult(static_cast(-0x1.cdfae5fffffffp-1),288181), + GreedyWalkResult(static_cast(-0x1.53c3200000001p-5),926790), + GreedyWalkResult(static_cast(-0x1.a8f37bfffffffp-5),164698), + GreedyWalkResult(static_cast(-0x1.e1399ffffffffp-7),517512), + GreedyWalkResult(static_cast(-0x1.adf8240000000p-3),587902), + GreedyWalkResult(static_cast(-0x1.f91ca60000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.b717880000000p-3), 70417), + GreedyWalkResult(static_cast(-0x1.57b0760000000p-4),939764), + GreedyWalkResult(static_cast(-0x1.1de1ca0000000p+0), 74899), + 
GreedyWalkResult(static_cast(-0x1.c67da40000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.64c96c0000001p-2),261103), + GreedyWalkResult(static_cast(-0x1.54c6240000000p-1),107308), + GreedyWalkResult(static_cast(-0x1.0274f60000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.5b05140000000p-1),969505), + GreedyWalkResult(static_cast(-0x1.1a4ca80000000p-6),950108), + GreedyWalkResult(static_cast(-0x1.de24900000000p-1),836318), + GreedyWalkResult(static_cast(-0x1.5c834e0000000p-3),228059), + GreedyWalkResult(static_cast(-0x1.682d5c0000000p-3),107308), + GreedyWalkResult(static_cast(-0x1.b96de80000000p-1),532480), + GreedyWalkResult(static_cast(-0x1.f1c5680000000p-1),186838), + GreedyWalkResult(static_cast(-0x1.d87015fffffffp-3),236872), + GreedyWalkResult(static_cast(-0x1.992d1ffffffffp-2),884850), + GreedyWalkResult(static_cast(-0x1.38d1580000001p-6),986292), + GreedyWalkResult(static_cast(-0x1.a59a700000001p-3),550771), + GreedyWalkResult(static_cast(-0x1.bb07fe0000001p-5),531816), + GreedyWalkResult(static_cast(-0x1.48fa060000000p-1),128603), + GreedyWalkResult(static_cast(-0x1.81b2000000001p-7),129055), + GreedyWalkResult(static_cast(-0x1.4bfc5bfffffffp-2),576030), + GreedyWalkResult(static_cast(-0x1.4683200000000p-1),727476), + GreedyWalkResult(static_cast(-0x1.9165800000000p-5), 38538), + GreedyWalkResult(static_cast(-0x1.2b59be0000000p-3),941181), + GreedyWalkResult(static_cast(-0x1.21086e0000000p-5),467026), + GreedyWalkResult(static_cast(-0x1.1fb5700000000p-7),986292), + GreedyWalkResult(static_cast(-0x1.3fa0620000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.d2b8bdfffffffp-2),355312), + GreedyWalkResult(static_cast(-0x1.ec8a43fffffffp-2),532480), + GreedyWalkResult(static_cast(-0x1.eeaace0000000p-9),385014), + GreedyWalkResult(static_cast(-0x1.5649140000000p-1),842476), + GreedyWalkResult(static_cast(-0x1.49e3ae0000001p-6), 29163), + GreedyWalkResult(static_cast(-0x1.b53db20000001p-5),442413), + 
GreedyWalkResult(static_cast(-0x1.5aa6380000000p-3),909721), + GreedyWalkResult(static_cast(-0x1.cdc0f80000000p-3),450479), + GreedyWalkResult(static_cast(-0x1.c9aab80000000p-2),541408), + GreedyWalkResult(static_cast(-0x1.0d78740000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.1a48820000000p-6),810043), + GreedyWalkResult(static_cast(-0x1.3a76fc0000000p-1),804725), + GreedyWalkResult(static_cast(-0x1.2f318a0000000p-7),562579), + GreedyWalkResult(static_cast(-0x1.6c91920000000p-2),270226), + GreedyWalkResult(static_cast(-0x1.9ac5940000000p-4),263560), + GreedyWalkResult(static_cast(-0x1.42bc8c0000000p-1),112754), + GreedyWalkResult(static_cast(-0x1.906b7c0000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.3586ac0000000p-7), 53791), + GreedyWalkResult(static_cast(-0x1.69ef5a0000000p-3),385014), + GreedyWalkResult(static_cast(-0x1.4e4f3e0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.b379440000000p-1),980037), + GreedyWalkResult(static_cast(-0x1.1a94380000000p+0),624004), + GreedyWalkResult(static_cast(-0x1.5e22e00000001p-8), 36331), + GreedyWalkResult(static_cast(-0x1.919a7c0000000p-1),883883), + GreedyWalkResult(static_cast(-0x1.0313ea0000000p+0),117555), + GreedyWalkResult(static_cast(-0x1.8781320000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.8504900000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.2e79740000000p-2),827608), + GreedyWalkResult(static_cast(-0x1.91ac000000000p-5),355549), + GreedyWalkResult(static_cast(-0x1.e0b6b80000000p-6),973080), + GreedyWalkResult(static_cast(-0x1.ae8bd00000000p-1), 26012), + GreedyWalkResult(static_cast(-0x1.edd4cc0000001p-5),587902), + GreedyWalkResult(static_cast(-0x1.1191160000000p-6),750819), + GreedyWalkResult(static_cast(-0x1.3c69140000000p-2),192244), + GreedyWalkResult(static_cast(-0x1.30a7540000000p+0),804725), + GreedyWalkResult(static_cast(-0x1.77bda40000002p-5),654035), + GreedyWalkResult(static_cast(-0x1.f0496e0000001p-1), 2251), + 
GreedyWalkResult(static_cast(-0x1.788009fffffffp-4),439426), + GreedyWalkResult(static_cast(-0x1.3527f9fffffffp+0),354262), + GreedyWalkResult(static_cast(-0x1.1914b20000000p+0), 16476), + GreedyWalkResult(static_cast(-0x1.4b03460000000p-4),648421), + GreedyWalkResult(static_cast(-0x1.25ae300000000p-1),292300), + GreedyWalkResult(static_cast(-0x1.cd467c0000000p-6), 47898), + GreedyWalkResult(static_cast(-0x1.e082960000001p-3),169790), + GreedyWalkResult(static_cast(-0x1.38970e0000000p-5),495101), + GreedyWalkResult(static_cast(-0x1.d88693fffffffp-2),348136), + GreedyWalkResult(static_cast(-0x1.13046c0000000p-1),439129), + GreedyWalkResult(static_cast(-0x1.ed2e720000001p-4),749981), + GreedyWalkResult(static_cast(-0x1.b162180000000p-5),864388), + GreedyWalkResult(static_cast(-0x1.458a1a0000000p-2),121683), + GreedyWalkResult(static_cast(-0x1.ffddf40000000p-6), 82234), + GreedyWalkResult(static_cast(-0x1.c99b320000001p-6),495323), + GreedyWalkResult(static_cast(-0x1.aa13de0000000p-3),226268), + GreedyWalkResult(static_cast(-0x1.36671e0000000p-4),236872), + GreedyWalkResult(static_cast(-0x1.276aaa0000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.41718e0000000p-6),973080), + GreedyWalkResult(static_cast(-0x1.39280c0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.8156020000000p-6),854497), + GreedyWalkResult(static_cast(-0x1.075a840000000p+0),930775), + GreedyWalkResult(static_cast(-0x1.0b01560000000p-1), 52041), + GreedyWalkResult(static_cast(-0x1.fabeec0000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.794f3a0000000p-1),384841), + GreedyWalkResult(static_cast(-0x1.d9d54dfffffffp-1),419057), + GreedyWalkResult(static_cast(-0x1.c27da80000000p-2),219992), + GreedyWalkResult(static_cast(-0x1.0d06660000000p-5),563395), + GreedyWalkResult(static_cast(-0x1.7ee86e0000000p-1),348136), + GreedyWalkResult(static_cast(-0x1.a219b9fffffffp-3),969505), + GreedyWalkResult(static_cast(-0x1.434a760000000p-4), 16476), + 
GreedyWalkResult(static_cast(-0x1.6cf4380000000p-1),677921), + GreedyWalkResult(static_cast(-0x1.94c9c00000000p-6),901398), + GreedyWalkResult(static_cast(-0x1.c625540000000p-5),932100), + GreedyWalkResult(static_cast(-0x1.2309d40000000p-1),677155), + GreedyWalkResult(static_cast(-0x1.3719a60000000p-4),112754), + GreedyWalkResult(static_cast(-0x1.2c1eba0000000p-6),527498), + GreedyWalkResult(static_cast(-0x1.affd100000000p-1),909721), + GreedyWalkResult(static_cast(-0x1.09db9c0000000p-2),790506), + GreedyWalkResult(static_cast(-0x1.b991e00000000p-4),535044), + GreedyWalkResult(static_cast(-0x1.2c3aec0000000p-8),938124), + GreedyWalkResult(static_cast(-0x1.cce0d20000000p-1),496356), + GreedyWalkResult(static_cast(-0x1.d80a4a0000000p-8),776790), + GreedyWalkResult(static_cast(-0x1.b3f6ec0000000p-1),749772), + GreedyWalkResult(static_cast(-0x1.d370f60000000p-1),441230), + GreedyWalkResult(static_cast(-0x1.17859e0000000p+0), 12009), + GreedyWalkResult(static_cast(-0x1.552dde0000000p-2),228514), + GreedyWalkResult(static_cast(-0x1.1e56f40000000p+0), 56131), + GreedyWalkResult(static_cast(-0x1.5b74140000000p-4),186084), + GreedyWalkResult(static_cast(-0x1.2bc8580000000p+0),870888), + GreedyWalkResult(static_cast(-0x1.03ba840000000p+0),385014), + GreedyWalkResult(static_cast(-0x1.9e8ea80000000p-2),114191), + GreedyWalkResult(static_cast(-0x1.9181880000000p-6),517512), + GreedyWalkResult(static_cast(-0x1.fd3e6a0000000p-3),255668), + GreedyWalkResult(static_cast(-0x1.d793e5fffffffp-6),511753), + GreedyWalkResult(static_cast(-0x1.335bf00000000p-6),679881), + GreedyWalkResult(static_cast(-0x1.98bd340000000p-1), 56131), + GreedyWalkResult(static_cast(-0x1.37253c0000000p-3),337863), + GreedyWalkResult(static_cast(-0x1.55a79e0000000p-2),270226), + GreedyWalkResult(static_cast(-0x1.f2ead00000001p-3),430269), + GreedyWalkResult(static_cast(-0x1.f45e060000002p-3),226356), + GreedyWalkResult(static_cast(-0x1.c435d60000001p-9), 81654), + 
GreedyWalkResult(static_cast(-0x1.1ea9580000000p+0),550771), + GreedyWalkResult(static_cast(-0x1.cc1a520000000p-2),444956), + GreedyWalkResult(static_cast(-0x1.9428000000000p-2),914163), + GreedyWalkResult(static_cast(-0x1.8f2a440000000p-2), 40323), + GreedyWalkResult(static_cast(-0x1.077cdc0000000p+0),582680), + GreedyWalkResult(static_cast(-0x1.31819c0000000p-3),292300), + GreedyWalkResult(static_cast(-0x1.5ae2840000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.0f86240000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.e4b8040000000p-2), 5217), + GreedyWalkResult(static_cast(-0x1.92a3020000000p-6),866106), + GreedyWalkResult(static_cast(-0x1.4c2bd40000000p-3),560074), + GreedyWalkResult(static_cast(-0x1.96bfae0000000p-2),225945), + GreedyWalkResult(static_cast(-0x1.7cfb9a0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.809e320000001p-5),890893), + GreedyWalkResult(static_cast(-0x1.1156de0000000p-1),313671), + GreedyWalkResult(static_cast(-0x1.eb64960000000p-1), 23136), + GreedyWalkResult(static_cast(-0x1.5a97fa0000000p-2),228059), + GreedyWalkResult(static_cast(-0x1.2f87c20000001p-1),945767), + GreedyWalkResult(static_cast(-0x1.45a1460000000p-4), 29348), + GreedyWalkResult(static_cast(-0x1.ddef220000001p-3),580161), + GreedyWalkResult(static_cast(-0x1.0b9e120000000p-5),179074), + GreedyWalkResult(static_cast(-0x1.f977160000001p-4),141149), + GreedyWalkResult(static_cast(-0x1.b366bc0000000p-1),660609), + GreedyWalkResult(static_cast(-0x1.7009520000000p-2),467026), + GreedyWalkResult(static_cast(-0x1.08adbe0000000p-3),550091), + GreedyWalkResult(static_cast(-0x1.c989580000000p-4),168533), + GreedyWalkResult(static_cast(-0x1.56433a0000000p-1),672634), + GreedyWalkResult(static_cast(-0x1.dbe0b00000000p-5),667763), + GreedyWalkResult(static_cast(-0x1.11c0620000000p+0),294738), + GreedyWalkResult(static_cast(-0x1.d6d5560000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.899de00000000p-1),239334), + 
GreedyWalkResult(static_cast(-0x1.fc2835fffffffp-4),550771), + GreedyWalkResult(static_cast(-0x1.28141e0000000p-6),986292), + GreedyWalkResult(static_cast(-0x1.abb32c0000000p-1),134340), + GreedyWalkResult(static_cast(-0x1.2c2b640000001p-3),926855), + GreedyWalkResult(static_cast(-0x1.3447780000000p-3), 47688), + GreedyWalkResult(static_cast(-0x1.5fb8300000000p-6),226268), + GreedyWalkResult(static_cast(-0x1.73cba7fffffffp-4), 40323), + GreedyWalkResult(static_cast(-0x1.b99f040000000p-1), 16476), + GreedyWalkResult(static_cast(-0x1.6b9ba60000000p-1),112754), + GreedyWalkResult(static_cast(-0x1.d3aa360000000p-1),192244), + GreedyWalkResult(static_cast(-0x1.25282a0000000p+0),275023), + GreedyWalkResult(static_cast(-0x1.16c09a0000000p-5), 56131), + GreedyWalkResult(static_cast(-0x1.bdd6720000000p-3),667763), + GreedyWalkResult(static_cast(-0x1.7421400000000p-1),587902), + GreedyWalkResult(static_cast(-0x1.dfa079fffffffp-9),630231), + GreedyWalkResult(static_cast(-0x1.debb760000001p-2),778627), + GreedyWalkResult(static_cast(-0x1.3589be0000000p-4),294738), + GreedyWalkResult(static_cast(-0x1.a659d00000000p-3),353498), + GreedyWalkResult(static_cast(-0x1.9f913bfffffffp-4),936836), + GreedyWalkResult(static_cast(-0x1.3b78740000000p-3),504419), + GreedyWalkResult(static_cast(-0x1.42611c0000000p-3),107308), + GreedyWalkResult(static_cast(-0x1.4e66860000000p-6),439809), + GreedyWalkResult(static_cast(-0x1.4a79000000000p-1),513240), + GreedyWalkResult(static_cast(-0x1.41902a0000000p+0),774981), + GreedyWalkResult(static_cast(-0x1.4850a60000000p-1),294738), + GreedyWalkResult(static_cast(-0x1.a7bf000000000p-1),236872), + GreedyWalkResult(static_cast(-0x1.9d67d60000001p-5),517512), + GreedyWalkResult(static_cast(-0x1.c908860000000p-2),854962), + GreedyWalkResult(static_cast(-0x1.63e9520000000p-2),513240), + GreedyWalkResult(static_cast(-0x1.e423200000000p-5),295526), + GreedyWalkResult(static_cast(-0x1.91894ffffffffp-2),476414), + 
GreedyWalkResult(static_cast(-0x1.29ba4a0000000p-4),774219), + GreedyWalkResult(static_cast(-0x1.a577500000000p-1),582680), + GreedyWalkResult(static_cast(-0x1.de39c80000000p-2),909721), + GreedyWalkResult(static_cast(-0x1.f75ad40000001p-1),385014), + GreedyWalkResult(static_cast(-0x1.93794a0000000p-1),750819), + GreedyWalkResult(static_cast(-0x1.5f65ec0000000p-3),294738), + GreedyWalkResult(static_cast(-0x1.23f7820000000p-6),786537), + GreedyWalkResult(static_cast(-0x1.a4f01e0000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.218c620000000p-12),134340), + GreedyWalkResult(static_cast(-0x1.33a59e0000000p-1), 40323), + GreedyWalkResult(static_cast(-0x1.c9920c0000000p-2),523435), + GreedyWalkResult(static_cast(-0x1.18be840000000p-2),865184), + GreedyWalkResult(static_cast(-0x1.0442d60000000p-1),729175), + GreedyWalkResult(static_cast(-0x1.047e940000000p+0),255668), + GreedyWalkResult(static_cast(-0x1.0d97ac0000000p-1),239334), + GreedyWalkResult(static_cast(-0x1.2a5e4e0000001p-4),660609), + GreedyWalkResult(static_cast(-0x1.f4887bfffffffp-1),294738), + GreedyWalkResult(static_cast(-0x1.a8d50c0000001p-2),531816), + GreedyWalkResult(static_cast(-0x1.8e5e300000001p-4),541780), + GreedyWalkResult(static_cast(-0x1.06e1a40000000p-2),236872), + GreedyWalkResult(static_cast(-0x1.2e98940000000p-5),385014), + GreedyWalkResult(static_cast(-0x1.1d7bb60000000p-4),320041), + GreedyWalkResult(static_cast(-0x1.93514a0000000p-6), 38538), + GreedyWalkResult(static_cast(-0x1.fe2429fffffffp-2),292300), + GreedyWalkResult(static_cast(-0x1.161f500000000p-6), 38538), + GreedyWalkResult(static_cast(-0x1.3d90900000000p-6),318039), + GreedyWalkResult(static_cast(-0x1.01c5040000000p-2),532480), + GreedyWalkResult(static_cast(-0x1.4f30960000000p-4),223261), + GreedyWalkResult(static_cast(-0x1.8a9b3c0000000p-4),382537), + GreedyWalkResult(static_cast(-0x1.02d07a0000000p-4),790506), + GreedyWalkResult(static_cast(-0x1.9527260000001p-2),294738), + 
GreedyWalkResult(static_cast(-0x1.047eea0000000p-1),886263), + GreedyWalkResult(static_cast(-0x1.d0deba0000000p-1),278930), + GreedyWalkResult(static_cast(-0x1.5c2d320000000p-1),236872), + GreedyWalkResult(static_cast(-0x1.f1670a0000000p-8),580161), + GreedyWalkResult(static_cast(-0x1.1426ce0000000p-3),550771), + GreedyWalkResult(static_cast(-0x1.b5f0ee0000000p-5),517512), + GreedyWalkResult(static_cast(-0x1.efd5180000000p-6),696486), + GreedyWalkResult(static_cast(-0x1.f1b0440000000p-6),118809), + GreedyWalkResult(static_cast(-0x1.28d45c0000000p-1),854962), + GreedyWalkResult(static_cast(-0x1.f18c5e0000000p-1),184077), + GreedyWalkResult(static_cast(-0x1.50e1320000000p-1),385014), + GreedyWalkResult(static_cast(-0x1.fb43600000000p-2),467026), + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp new file mode 100644 index 000000000..ec4a799d7 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResults.hpp @@ -0,0 +1,9 @@ +#pragma once +#include +#include +namespace ipnsw { + using GreedyWalkResult = std::pair; + extern std::vector GREEDY_WALK_RESULTS; + static constexpr int GWR_DIST = 0; + static constexpr int GWR_VERT = 1; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp new file mode 100644 index 000000000..7dd4ef05e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp @@ -0,0 +1,223 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ipnsw { + //using graph_tools::Graph; + //using graph_tools::Graph500Data; + + class Parser { + public: + using OptionTable = std::map; + + Parser(){} + + void parse(int argc, char *argv[]) { + int pos = 0; + int arg = 0; + + while (arg < argc) { + std::string argstr = std::string(argv[arg]); + if (ipnsw::startswith(argstr, "--")) { + // optional argument + if (++arg >= argc) { + throw std::runtime_error("'" + argstr + "' 
requires an argument"); + } + _options[argstr] = std::string(argv[arg]); + + } else { + // positional argument + switch (pos++) { + case 0: + _exe = argstr; + break; + + case 1: + _ucode = argstr; + break; + + case 2: + _version = argstr; + break; + + case 3: + _data = argstr; + break; + + case 4: + _queries = argstr; + break; + + case 5: + case 6: + case 7: + case 8: + _graphs.push_back(argstr); // positions 5-8: up to four graph files + break; + + default: + break; // any further positional argument is ignored + } + } + arg++; + }; + + // _exe = std::string(argv[0]); + // _ucode = std::string(argv[1]); + // _version = std::string(argv[2]); + // _data = std::string(argv[3]); + // _queries = std::string(argv[4]); + // // graphs + // for (int i = 5; i < argc; ++i) { + // _graphs.push_back(std::string(argv[i])); + // } + } + + std::string str() const { // one "name: value" line per positional argument + std::stringstream ss; + ss << "ucode: " << _ucode << "\n" + << "version: " << _version << "\n" + << "exe: " << _exe << "\n" + << "data: " << _data << "\n" + << "queries: " << _queries << "\n"; + + for (size_t i = 0; i < _graphs.size(); ++i) { // size_t matches _graphs.size(), no signed/unsigned comparison + ss << "graph " << i << ": " << _graphs[i] << "\n"; + } + + return ss.str(); + } + + std::string option(const std::string &opt) const { // value recorded for opt, or "" if it was never given + auto it = _options.find(opt); + if (it != _options.end()) + return it->second; + + return ""; + } + + std::vector do_queries() const { // integers listed (comma-separated) in the --queries option; empty if absent + std::string do_queries_str = option("--queries"); + if (do_queries_str.empty()) { + return {}; + } + + std::vector _do_queries; + size_t pos = 0; + size_t at = 0; + + while ((at = do_queries_str.find(",", pos)) != std::string::npos) { // turn commas into spaces so the stream can split the list + do_queries_str.replace(at, 1, " "); + pos = at+1; + } + + std::stringstream ss(do_queries_str); + int q; + // loop on the extraction itself: testing good() first appended a bogus value after a trailing separator or bad token + while (ss >> q) { + _do_queries.push_back(q); + } + + return _do_queries; + } + + int num_iproducts() const { // value of --num-iproducts, 100 when the option is absent + int n = 100; + auto s = option("--num-iproducts"); + if (!s.empty()) { + n = from_string(s); + } + return n; + } + + std::string ucode() const { return _ucode; } + std::string version() const { return _version; } + std::string exe() 
const { return _exe; } + std::vector graphs() const { return _graphs; } + std::string graph(int i) const { return _graphs[i]; } + std::string data() const { return _data; } + std::string queries() const { return _queries; } + + std::string _ucode; + std::string _version; + std::string _exe; + std::vector _graphs; + std::string _data; + std::string _queries; + OptionTable _options; + }; + + class IO { + public: + IO() {} + IO(const Parser &p): _parser(p) {} + + + graph_tools::Graph graph(int i) { // loads the i-th graph file named on the command line + std::cout << "Reading graph " << i << ": " + << _parser._graphs[i] << std::endl; + + graph_tools::Graph500Data d = graph_tools::Graph500Data::FromASCIIFile(_parser._graphs[i]); + return graph_tools::Graph::FromGraph500Data(d); + } + + std::vector graphs() { // loads every graph file, in command-line order + std::vector graphs; + for (int i = 0; i < _parser._graphs.size(); ++i) + graphs.push_back(graph(i)); + + return graphs; + } + + template + std::vector read(const std::string & fname) { // reads fname as a flat array of T; throws std::runtime_error on stat/fopen failure or a short read + int r; + struct stat st; + + std::cerr << "Opening " << fname << std::endl; + + r = stat(fname.c_str(), &st); + if (r != 0) { + auto s = fname + ": " + std::string(strerror(errno)); + throw std::runtime_error(s); + } + std::vector v(st.st_size/sizeof(T)); // whole elements only; a trailing partial element is dropped + + FILE *f = fopen(fname.c_str(), "rb"); + if (!f) { + auto s = fname + ": " + std::string(strerror(errno)); + throw std::runtime_error(s); + } + const size_t nread = fread(v.data(), sizeof(T), v.size(), f); // never more than v holds (st_size bytes could overrun it); safe for an empty file + fclose(f); + if (nread != v.size()) throw std::runtime_error(fname + ": short read"); + return v; + } + + template + std::vector> + database() { // contents of the database file given on the command line + using array = std::array; + return read(_parser._data); + } + + template + std::vector> + queries() { // contents of the query file given on the command line + using array = std::array; + return read(_parser._queries); + } + + std::string ucode() const { return _parser._ucode; } + std::vector do_queries() const { return _parser.do_queries(); } + + Parser _parser; + }; + +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp new file mode 100644 index 000000000..55e410789 --- /dev/null +++ 
b/examples/sdh-eval-workloads/ipnsw/IPNSWFactory.hpp @@ -0,0 +1,18 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWResultReader.hpp" +namespace ipnsw { + class IPNSWFactory { // abstract factory; subclasses supply raw pointers that the public methods wrap in unique_ptr + public: + virtual ~IPNSWFactory() = default; // makes deleting a derived factory through a base pointer well-defined + std::unique_ptr KernelRunner()const { + return std::unique_ptr(_KernelRunner()); + } + std::unique_ptr ResultReader()const { + return std::unique_ptr(_ResultReader()); + } + protected: + virtual IPNSWKernelRunner* _KernelRunner()const = 0; + virtual IPNSWResultReader* _ResultReader()const = 0; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp new file mode 100644 index 000000000..4f942db01 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWGraph.hpp @@ -0,0 +1,70 @@ +#pragma once +#include +#include +#include +#include +#include <utility> + +namespace ipnsw { + class Graph { // host-side graph plus the device addresses of its offset and neighbor arrays + public: + Graph() : Graph(graph_tools::Graph()) {} + Graph(const graph_tools::Graph &g) : _graph(g) {} + Graph(graph_tools::Graph &&g) : _graph(std::move(g)) {} // g is an lvalue here; without std::move the graph was copied + + void initialize_on_device() { // allocates one device buffer per array and push_writes the host data into it + using hammerblade::host::HammerBlade; + HammerBlade::Ptr hb = HammerBlade::Get(); + + auto & offsets = _graph.get_offsets(); + auto & neighbors = _graph.get_neighbors(); + + _offsets = hb->alloc(offsets.size() * sizeof(offsets[0])); + _neighbors = hb->alloc(neighbors.size() * sizeof(neighbors[0])); + + hb->push_write(_offsets, &offsets[0], offsets.size() * sizeof(offsets[0])); + hb->push_write(_neighbors, &neighbors[0], neighbors.size() * sizeof(neighbors[0])); + } + + graph_tools::Graph & graph() { return _graph; } + const graph_tools::Graph & graph() const { return _graph; } + hb_mc_eva_t offsets() const { return _offsets; } + hb_mc_eva_t neighbors() const { return _neighbors; } + + static hb_mc_eva_t InitializeMetadataOnDevice(const std::vector & Gs) { // writes one {offset, neighbors, V, E} record per graph to the device and returns its address + using hammerblade::host::HammerBlade; + HammerBlade::Ptr hb = HammerBlade::Get(); + struct metadata { + hb_mc_eva_t offset; + hb_mc_eva_t neighbors; + int V; + int E; + }; + + std::vector metad; + 
for (auto & g : Gs) { + std::cout << "Host: offset = " << std::hex << g.offsets() << " neighbors = " << g.neighbors() << std::endl; + std::cout << std::dec; + metadata m = { + .offset = g.offsets(), + .neighbors = g.neighbors(), + g.graph().num_nodes(), + g.graph().num_edges() + }; + metad.push_back(m); + } + + hb_mc_eva_t metadata = hb->alloc(sizeof(struct metadata) * metad.size()); + hb->push_write(metadata, &metad[0], sizeof(struct metadata) * metad.size()); + hb->sync_write(); + + return metadata; + } + + private: + graph_tools::Graph _graph; + + hb_mc_eva_t _offsets; + hb_mc_eva_t _neighbors; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp new file mode 100644 index 000000000..e6042acaa --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp @@ -0,0 +1,31 @@ +#pragma once +#include "HammerBlade.hpp" +#include +#include +namespace ipnsw { + class IPNSWRunner; // forward declaration + + class IPNSWKernelRunner { + public: + using HammerBlade = hammerblade::host::HammerBlade; + using Dim = hammerblade::host::Dim; + IPNSWKernelRunner() {} + + protected: + virtual std::string kernelName(const IPNSWRunner & runner) const =0; + virtual std::vector argv(const IPNSWRunner & runner) const =0; + virtual Dim gd(const IPNSWRunner &runner) const = 0; + virtual Dim tgd(const IPNSWRunner &runner) const = 0; + + public: + void runKernel(IPNSWRunner &runner) { + HammerBlade::Ptr hb = HammerBlade::Get(); + hb->push_jobv(gd(runner), + tgd(runner), + kernelName(runner), + argv(runner)); + hb->exec(); + } + }; + +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp new file mode 100644 index 000000000..19eaff181 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWResultReader.hpp @@ -0,0 +1,13 @@ +#pragma once +#include "HammerBlade.hpp" +namespace ipnsw { + class IPNSWRunner; + + class 
IPNSWResultReader { + protected: + using HammerBlade = hammerblade::host::HammerBlade; + + public: + virtual void readResults(const IPNSWRunner & runner) {} + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp new file mode 100644 index 000000000..3dbca5bec --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp @@ -0,0 +1,184 @@ +#pragma once +#include "IO.hpp" +#include "HammerBlade.hpp" +#include "IPNSWGraph.hpp" +#include "IPNSWFactory.hpp" +#include "IPNSWKernelRunner.hpp" +#include "IPNSWResultReader.hpp" +#include "GreedyWalkResults.hpp" +#include + +namespace ipnsw { + + class IPNSWRunner { + public: + //static constexpr int QUERY = 276; // fewest dot products for greedy walk + //static constexpr int QUERY = 472; // fewest dot products for beam search + //static constexpr int QUERY = 427; + //static constexpr int QUERY = 355; + //static constexpr int QUERY = 2; + static constexpr int QUERY = 188; + //static constexpr int QUERY = 229; + //static constexpr int QUERY = 490; + //static constexpr int QUERY = 16; + //static constexpr int QUERY = 461; + //static constexpr int QUERY = 470; + + using HammerBlade = hammerblade::host::HammerBlade; + using Dim = hammerblade::host::Dim; + + IPNSWRunner(const Parser &p, + std::unique_ptr & fact): + _factory(std::move(fact)) { + _io = std::unique_ptr(new IO(p)); + _hb = HammerBlade::Get(); + _kernel_runner = _factory->KernelRunner(); + _result_reader = _factory->ResultReader(); + } + + virtual ~IPNSWRunner() { delete _hb; } + + void readInput() { + auto graphs = _io->graphs(); + _graphs = { + Graph(std::move(graphs[3])), + Graph(std::move(graphs[2])), + Graph(std::move(graphs[1])), + Graph(std::move(graphs[0])) + }; + + _db = _io->database(); + _queries = _io->queries(); + } + + void loadProgram() { + _hb->load_application(ucodePath()); + } + + void initializeDeviceMemoryDB() { + std::cout << "Initializing database " << std::endl; + 
_db_dev = _hb->alloc(_db.size() * sizeof(_db[0])); + _hb->push_write(_db_dev, &_db[0], _db.size() * sizeof(_db[0])); + } + + void initializeDeviceMemoryQuery() { // copies one query vector to the device: --queries[0] if given, else QUERY + std::cout << "Initializing query " << std::endl; + int query = QUERY; + auto do_queries = _io->do_queries(); + if (!do_queries.empty()) + query = do_queries[0]; // only the first requested query is used + if (query < 0 || static_cast<size_t>(query) >= _queries.size()) // reject an index outside the loaded query set instead of reading past it + throw std::runtime_error("query index out of range"); + _query_dev = _hb->alloc(sizeof(_queries[query])); + _hb->push_write(_query_dev, &_queries[query], sizeof(_queries[query])); + } + + void initializeDeviceMemorySeen() { + std::cout << "Initializing seen set " << std::endl; + _seen_dev = _hb->alloc(_db.size() * sizeof(int)); + } + + void initializeDeviceMemoryGraphs() { + for (auto & graph : _graphs) + graph.initialize_on_device(); + + _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs); + } + + void initializeDeviceVCurr() { + _v_curr_dev = _hb->alloc(sizeof(int)); + } + void initializeDeviceDCurr() { + _d_curr_dev = _hb->alloc(sizeof(float)); + } + + void initializeDeviceCandidateDev() { + _candidates_dev = _hb->alloc(sizeof(GreedyWalkResult)*513); + } + + void initializeDeviceResultsDev() { + _results_dev = _hb->alloc(sizeof(GreedyWalkResult) * 129); + } + + void initializeDeviceNResultsDev() { + _n_results_dev = _hb->alloc(sizeof(int)); + } + + void initializeDeviceMemory() { // runs every initializer above, then waits on sync_rw() + initializeDeviceMemoryDB(); + initializeDeviceMemoryQuery(); + initializeDeviceMemorySeen(); + initializeDeviceMemoryGraphs(); + initializeDeviceVCurr(); + initializeDeviceDCurr(); + initializeDeviceCandidateDev(); + initializeDeviceResultsDev(); + initializeDeviceNResultsDev(); + // sync + std::cout << "Starting DMA" << std::endl; + _hb->sync_rw(); + } + + void runKernel() { + std::cout << "Launching kernel" << std::endl; + _kernel_runner->runKernel(*this); + } + + void readResults() { + _result_reader->readResults(*this); + + } + + void run() { // full pipeline: input, program load, device setup, kernel, results + readInput(); + loadProgram(); + initializeDeviceMemory(); + runKernel(); + readResults(); + } + + ///////////// + // Getters // + ///////////// + 
std::string ucodePath() const { + return _io->ucode(); + } + + hb_mc_eva_t db_dev() const { return _db_dev; } + hb_mc_eva_t query_dev() const { return _query_dev; } + hb_mc_eva_t seen_dev() const { return _seen_dev; } + hb_mc_eva_t v_curr_dev() const { return _v_curr_dev; } + hb_mc_eva_t d_curr_dev() const { return _d_curr_dev; } + hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; } + hb_mc_eva_t candidates_dev() const { return _candidates_dev; } + hb_mc_eva_t results_dev() const { return _results_dev; } + hb_mc_eva_t n_results_dev() const { return _n_results_dev; } + + ///////////// + // Setters // + ///////////// + + private: + std::unique_ptr _io; + std::vector _graphs; + std::vector> _db; + std::vector> _queries; + HammerBlade::Ptr _hb; + + // device pointers + hb_mc_eva_t _db_dev{}; // value-initialised so the getters above are defined before initializeDeviceMemory() runs + hb_mc_eva_t _query_dev{}; + hb_mc_eva_t _seen_dev{}; + hb_mc_eva_t _v_curr_dev{}; + hb_mc_eva_t _d_curr_dev{}; + hb_mc_eva_t _graph_metadata_dev{}; + hb_mc_eva_t _candidates_dev{}; + hb_mc_eva_t _results_dev{}; + hb_mc_eva_t _n_results_dev{}; + + // composites + std::unique_ptr _kernel_runner; + std::unique_ptr _result_reader; + std::unique_ptr _factory; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp new file mode 100644 index 000000000..9a4861844 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +namespace ipnsw { + class IProductUBmkFactory : public IPNSWFactory { // builds the inner-product micro-benchmark runner and reader + public: + IProductUBmkFactory(int iterations = 10): + _iterations(iterations) { + } + + private: + IPNSWKernelRunner *_KernelRunner() const override { return new IProductUBmkKernelRunner(_iterations); } // override: checked against the base signature + IPNSWResultReader *_ResultReader() const override { return new IProductUBmkResultReader; } + + int _iterations; + }; +} + diff --git 
a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp new file mode 100644 index 000000000..e9d3010bc --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp @@ -0,0 +1,31 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IPNSWRunner.hpp" + +namespace ipnsw { + class IProductUBmkKernelRunner : public IPNSWKernelRunner { // launches the inner-product micro-benchmark kernel on a 1x1 grid and tile group + public: + IProductUBmkKernelRunner(int iterations = 10) : + IPNSWKernelRunner(), + _iterations(iterations) { + } + + private: + std::string kernelName(const IPNSWRunner & runner) const override { // override: checked against the base signature + return "inner_product_ubmk"; + } + + std::vector argv(const IPNSWRunner & runner) const override { // kernel arguments: database, query, iteration count + std::vector argv = { + runner.db_dev(), // database + runner.query_dev(), // query + static_cast(_iterations), // number of inner products + }; + return argv; + } + Dim gd(const IPNSWRunner &runner) const override {return Dim(1,1);} + Dim tgd(const IPNSWRunner &runner) const override {return Dim(1,1);} + + int _iterations; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp new file mode 100644 index 000000000..300990b18 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkResultReader.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "IPNSWRunner.hpp" +#include "IPNSWResultReader.hpp" + +namespace ipnsw { + class IProductUBmkResultReader : public IPNSWResultReader { // reads nothing back; only reports completion + public: + void readResults(const IPNSWRunner & runner) override { + std::cout << "Done" << std::endl; + } + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile new file mode 100644 index 000000000..6e814f018 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -0,0 +1,351 @@ +# Copyright (c) 2019, University of Washington All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +################################################################################ +# Paths / Environment Configuration +################################################################################ +_REPO_ROOT ?= $(shell git rev-parse --show-toplevel) +CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +-include $(_REPO_ROOT)/environment.mk + +################################################################################ +# Define BSG_MACHINE_PATH, the location of the Makefile.machine.include file +# that defines the machine to compile and simulate on. Using BSG_F1_DIR (which +# is set in environment.mk) uses the same machine as in bsg_replicant. +################################################################################ + +BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm + +################################################################################ +# Define the range of versions +################################################################################ +# Kernel versions. See kernel/README.md for more information. 
Version names do +# not need to use v* and can be any string +VERSIONS := greedy_walk # inner product with ipc=0.3 (8x4) +VERSIONS += greedy_walk_v1 # inner product with ipc=0.43 (8x4) +VERSIONS += greedy_walk_v2 # inner product with FLOPS/cycle=0.2 (8x4) +VERSIONS += greedy_walk_v3 # inner product with FLOPS/cycle=0.26 (8x4) +VERSIONS += beam_search # very slow - uses a very dumb sparse set +VERSIONS += beam_search_v1 # dense set - inner product with ipc=0.3 (8x4) +VERSIONS += beam_search_v2 # dense set - inner product with ipc=0.43 (8x4) +VERSIONS += beam_search_v3 # + inner_product_v2 (flops/cycle=0.2039) (8x4) +VERSIONS += beam_search_v4 # + inner_product_v3 (flops/cycle=0.2663) (8x4) +VERSIONS += beam_search_v5 # + Bit vector for dense set +VERSIONS += debug +VERSIONS += iproduct_ubmk # baseline - ipc = 0.3 +VERSIONS += iproduct_ubmk_v1 # using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867 +VERSIONS += iproduct_ubmk_v2 # + FMA, ipc = 0.386, flops/cycle = 0.2039 +VERSIONS += iproduct_ubmk_v3 # + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4) +VERSIONS += iproduct_ubmk_v4 # Slightly cleaner code than v3 - similar performance + +_KERNEL_COMPILER = CLANG +################################################################################ +# Define any sources that should be compiled during kernel compilation, +# including the source file with the kernel itself. kernel.riscv will +# be the name of the compiled RISC-V Binary for the Manycore +# +# Use KERNEL_*LIBRARIES to list sources that should be compiled and linked with all +# kernel.cpp versions. However, if you have version-specific sources you must +# come up with your own solution. +# +# Use KERNEL_INCLUDES to specify the path to directories that contain headers. 
+################################################################################ + +# C Libraries +KERNEL_CLIBRARIES += +# C++ Libraries +KERNEL_CXXLIBRARIES += + +KERNEL_INCLUDES += -I$(CURRENT_PATH)/kernel/include + +# Define the default kernel.cpp file. If KERNEL_DEFAULT is not defined it will +# be set to kernel.cpp in the same directory as this Makefile. +DEFAULT_VERSION := greedy_walk_v3 +KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp +#KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.c + +################################################################################ +# Include the kernel build rules (This must be included after KERNEL_*LIBRARIES, +# KERNEL_DEFAULT, KERNEL_INCLUDES, etc) +################################################################################ + +-include $(EXAMPLES_PATH)/examples/cuda/riscv.mk + +################################################################################ +# END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES +################################################################################ + + +################################################################################ +# Define the $(HOST_TARGET), the name of the host executable to generate. The +# cosimulation host executable will be called +# $(HOST_TARGET).cosim. HOST_*SOURCES list the host files that should be +# compiled and linked into the executable. 
+################################################################################ + +HOST_TARGET := ipnsw +HOST_CSOURCES := +HOST_CXXSOURCES += GreedyWalkResults.cpp +HOST_INCLUDES := -I$(CURRENT_PATH) + +################################################################################ +# Include the Cosimulation host build rules (This must be included after +# HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc) +################################################################################ + +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0 +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1 +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2 +ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3 + + +################################ +# Inner Product U-Benchmarking # +################################ +# number iproducts +N-IPRODUCTS := 150 500 1000 1500 2000 3000 +IPRODUCT-BASENAME := iproduct_ubmk_v4 + +define IPRODUCT-UBMK-RULE +# creates run directory from template +kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp + mkdir -p $$(dir $$@) + cp $$< $$@ + +# adds arguments +kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: ARGS += --num-iproducts $(1) + +# adds to list of iproduct u-bmk +IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1) +endef + +# Expand rule for each inner product input +$(foreach nip,$(N-IPRODUCTS),$(eval $(call IPRODUCT-UBMK-RULE,$(nip)))) + +.PHONY: create-iproduct-ubmk +.PHONY: purge-iproduct-ubmk +.PHONY: iproduct-ubmk-stats + +# create rule +create-iproduct-ubmk: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/kernel.cpp) + +# purge rule +purge-iproduct-ubmk: + rm -rf $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v) + +# collect stats for all +iproduct-ubmk-stats: create-iproduct-ubmk +iproduct-ubmk-stats: $(foreach 
v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/stats) + +# Add to versions +VERSIONS += $(IPRODUCT-UBMK-VERSIONS) + +#################### +# Greedy Walk Runs # +#################### +GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490 +GREEDY-WALK-BASENAME := greedy_walk_v3 +define GREEDY-WALK-RULE +# creates run directory from template +kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp + mkdir -p $$(dir $$@) + cp $$< $$@ + +# adds arguments +kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1) + +# adds to list of greedy walk versions +GREEDY-WALK-VERSIONS += greedy_walk-query$(1) +endef + +# Expand rule for each query +$(foreach q,$(GREEDY-WALK-QUERIES),$(eval $(call GREEDY-WALK-RULE,$(q)))) + +.PHONY: create-greedy-walk +.PHONY: purge-greedy-walk +.PHONY: greedy-walk-stats + +# create rule +create-greedy-walk: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/kernel.cpp) + +# purge rule +purge-greedy-walk: + rm -rf $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v) + +# collect stats for all +greedy-walk-stats: create-greedy-walk +greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/stats) + +# Add to versions +VERSIONS += $(GREEDY-WALK-VERSIONS) + +#################### +# Beam Search Runs # +#################### +BEAM-SEARCH-QUERIES := 2 188 229 355 427 472 +BEAM-SEARCH-BASENAME := beam_search_v5 + +define BEAM-SEARCH-RULE +# creates run directory from template +kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.cpp + mkdir -p $$(dir $$@) + cp $$< $$@ + +# adds arguments +kernel/beam_search-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1) + +# adds to list of greedy walk versions +BEAM-SEARCH-VERSIONS += beam_search-query$(1) +endef + + +# Expand rule for each query +$(foreach q,$(BEAM-SEARCH-QUERIES),$(eval $(call BEAM-SEARCH-RULE,$(q)))) + +.PHONY: create-beam-search +.PHONY: purge-beam-search +.PHONY: beam-search-stats + +# create rule +create-beam-search: $(foreach 
v,$(BEAM-SEARCH-VERSIONS),kernel/$v/kernel.cpp) + +# purge rule +purge-beam-search: + rm -rf $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v) + +# collect stats for all +beam-search-stats: create-beam-search +beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/stats) + +# Add to versions +VERSIONS += $(BEAM-SEARCH-VERSIONS) + +######################################## +# Continue including cosim build rules # +######################################## + +-include $(FRAGMENTS_PATH)/host/cosim.mk + +GRAPH-TOOLS := $(CURRENT_PATH)/graph-tools +graphtools-dir := $(GRAPH-TOOLS) + +include $(GRAPH-TOOLS)/libgraphtools.mk + +HB-HELPERS := $(CURRENT_PATH)/hammerblade-helpers +include $(HB-HELPERS)/libhammerblade-helpers-host.mk + +CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags) +CXXFLAGS += $(libgraphtools-interface-cxxflags) + +LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags) +LDFLAGS += $(libgraphtools-interface-ldflags) +VSOURCES += GreedyWalkResults.cpp + +$(HOST_TARGET): $(libhammerblade-helpers-host-interface-headers) +$(HOST_TARGET): $(libgraphtools-interface-headers) +$(HOST_TARGET): $(libgraphtools-interface-libraries) +$(HOST_TARGET): GreedyWalkResults.o + +GreedyWalkResults.o: GreedyWalkResults.cpp +GreedyWalkResults.o: GreedyWalkResults.hpp + +ipnsw.o: IO.hpp +ipnsw.o: IPNSWGraph.hpp +ipnsw.o: IPNSWRunner.hpp +ipnsw.o: IPNSWKernelRunner.hpp +ipnsw.o: GreedyWalkKernelRunner.hpp +ipnsw.o: BeamSearchKernelRunner.hpp +ipnsw.o: IProductUBmkKernelRunner.hpp +ipnsw.o: IPNSWResultReader.hpp +ipnsw.o: GreedyWalkResultReader.hpp +ipnsw.o: BeamSearchResultReader.hpp +ipnsw.o: GreedyWalkResults.hpp +ipnsw.o: IPNSWFactory.hpp +ipnsw.o: GreedyWalkFactory.hpp +ipnsw.o: BeamSearchFactory.hpp +ipnsw.o: IProductUBmkFactory.hpp +ipnsw.o: StringHelpers.hpp +################################################################################ +# Define the clean rules. 
clean calls the makefile-specific cleans, whereas +# users can add commands and dependencies to custom.clean. +################################################################################ +version.clean: + rm -rf kernel/*/*{.csv,.log,.rvo,.riscv,.vpd,.key,.png,.dis} + rm -rf kernel/*/{stats,pc_stats} + +custom.clean: version.clean + +clean: cosim.clean analysis.clean cudalite.clean custom.clean + +################################################################################ +# Define overall-goals. The all rule runs all kernel versions, and the default +# kernel. +################################################################################ + +_HELP_STRING := "Makefile Rules\n" + +_HELP_STRING += " default: \n" +_HELP_STRING += " - Run the default kernel ($(KERNEL_DEFAULT)) and generate all of the\n" +_HELP_STRING += " analysis products\n" +default: pc_stats graphs stats + +_HELP_STRING += " analysis: \n" +_HELP_STRING += " - Launch indpendent cosimulation executions of each kernel version.\n" +_HELP_STRING += " When execution finishes, it generates all the analysis products \n" +_HELP_STRING += " for each kernel in each respective kernel// \n" +_HELP_STRING += " directory\n" +analysis: $(foreach v,$(VERSIONS),kernel/$v/pc_stats kernel/$v/graphs kernel/$v/stats) + +_HELP_STRING += " statistics: \n" +_HELP_STRING += " - Launch indpendent cosimulation executions of each kernel version.\n" +_HELP_STRING += " When execution finishes, it generates ONLY the parsed operation \n" +_HELP_STRING += " stats for each kernel in each respective kernel// \n" +_HELP_STRING += " directory\n" +statistics: $(foreach v,$(VERSIONS),kernel/$v/stats) + +_HELP_STRING += " all: \n" +_HELP_STRING += " - Launch both the default and analysis target\n" +all: analysis default + +.DEFAULT_GOAL = help +_HELP_STRING += " help: \n" +_HELP_STRING += " - Output a friendly help message.\n" +help: + @echo -e $(HELP_STRING) + +# Always re-run, if asked.
+.PHONY: default analysis help + +# These last three lines ensure that _HELP_STRING is appended to the top of +# whatever else comes before it. +_HELP_STRING += "\n" +_HELP_STRING += $(HELP_STRING) +HELP_STRING := $(_HELP_STRING) + diff --git a/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp new file mode 100644 index 000000000..39e09b1a9 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/StringHelpers.hpp @@ -0,0 +1,17 @@ +#pragma once +#include +#include + +namespace ipnsw { + static bool startswith(const std::string &st, const std::string &prefix) { + return st.rfind(prefix, 0) == 0; + } + + template + T from_string(const std::string &str) { + std::stringstream ss(str); + T v; + ss >> v; + return v; + } +} diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval new file mode 160000 index 000000000..5915cc2c4 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval @@ -0,0 +1 @@ +Subproject commit 5915cc2c4bc6336102c452a4e7d0a7b06ccf9222 diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp new file mode 100644 index 000000000..ea920b295 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp @@ -0,0 +1,86 @@ +#include "ipnsw.hpp" +#include "HammerBlade.hpp" +#include "Graph500Data.hpp" +#include "Graph.hpp" +#include "IO.hpp" +#include "IPNSWGraph.hpp" +#include "IPNSWRunner.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +#include "IProductUBmkFactory.hpp" +#include "BeamSearchKernelRunner.hpp" +#include "BeamSearchResultReader.hpp" +#include "BeamSearchFactory.hpp" +#include "GreedyWalkKernelRunner.hpp" +#include "GreedyWalkResultReader.hpp" +#include "GreedyWalkFactory.hpp" +#include "GreedyWalkResults.hpp" +#include "StringHelpers.hpp" +#include +#include + +#include "GreedyWalkResults.cpp" + +using namespace ipnsw; + +int Main(int argc, char 
*argv[]) +{ + Parser args; + args.parse(argc, argv); + + std::unique_ptr runner; + std::unique_ptr factory; + + if (ipnsw::startswith(args.version(), "greedy_walk")) { + factory = std::unique_ptr(new GreedyWalkFactory); + } else if (ipnsw::startswith(args.version(), "beam_search")) { + factory = std::unique_ptr(new BeamSearchFactory); + } else if (ipnsw::startswith(args.version(), "iproduct_ubmk")) { + /* parse the number of inner products */ + std::cout << "num inner products " << args.num_iproducts() << std::endl; + int n_iproducts = args.num_iproducts(); + factory = std::unique_ptr(new IProductUBmkFactory(n_iproducts)); + } else if (args._version == "debug") { + /* just for debugging */ + std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl; + std::cout << "--queries="; + auto do_queries = args.do_queries(); + for (auto q : do_queries) { + std::cout << q << " "; + } + std::cout << std::endl; + return 0; + } else { + return 0; + } + + runner = std::unique_ptr(new IPNSWRunner(args, factory)); + runner->run(); + + return 0; +} + +#ifdef COSIM +void cosim_main(uint32_t *exit_code, char * args) { + // We aren't passed command line arguments directly so we parse them + // from *args. args is a string from VCS - to pass a string of arguments + // to args, pass c_args to VCS as follows: +c_args="" + int argc = get_argc(args); + char *argv[argc]; + get_argv(args, argc, argv); + +#ifdef VCS + svScope scope; + scope = svGetScopeFromName("tb"); + svSetScope(scope); +#endif + int rc = Main(argc, argv); + *exit_code = rc; + return; +} +#else +int main(int argc, char ** argv) { + return Main(argc, argv); +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp new file mode 100644 index 000000000..385873c50 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp @@ -0,0 +1,37 @@ +// Copyright (c) 2019, University of Washington All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, this list +// of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// +// Neither the name of the copyright holder nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include "../common.h" diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp new file mode 100644 index 000000000..a75d5b1bf --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search/kernel.cpp @@ -0,0 +1,182 @@ +/* + * This kernel implements the IPNSW beam search (see ipnsw_beam_search) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); +
bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + int ipnsw_beam_search(const graph *Gs, const float *database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DynSet> seen(seen_mem, N_V); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); + //v_worst = std::get<1>(results.top()); + bsg_print_int(-v_best); + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best (the last vertex's edge list runs to G.E) + int dst_0 = G.offsets[v_best]; + int degree = v_best == G.V-1 ?
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + bsg_print_int(dst); + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp new file mode 100644 index 000000000..a69965073 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp @@ -0,0 +1,188 @@ +/* + * This kernel implements the IPNSW beam search (see ipnsw_beam_search) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, const float *database, 
const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best (the last vertex's edge list runs to G.E) + int dst_0 = G.offsets[v_best]; + int degree = v_best == G.V-1 ?
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp new file mode 100644 index 000000000..b0f374a4c --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v2/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel implements the IPNSW beam search (see ipnsw_beam_search) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v1(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best (the last vertex's edge list runs to G.E) + int dst_0 = G.offsets[v_best]; + int degree = v_best == G.V-1 ?
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp new file mode 100644 index 000000000..f98216636 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v3/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel implements the IPNSW beam search (see ipnsw_beam_search) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v2(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best (the last vertex's edge list runs to G.E) + int dst_0 = G.offsets[v_best]; + int degree = v_best == G.V-1 ?
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp new file mode 100644 index 000000000..01f62555f --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v4/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel implements the IPNSW beam search (see ipnsw_beam_search) + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated.
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v3(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSetseen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp new file mode 100644 index 000000000..18a29fd33 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp @@ -0,0 +1,189 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + +#define distance(v0, v1) \ + (-1 * inner_product_v3(v0, v1)) + + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote 
const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+n_res, LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp new file mode 100644 index 000000000..9c761e94d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/debug/kernel.cpp @@ -0,0 +1,2 @@ +extern "C" int empty() { +} diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp new file mode 100644 index 000000000..385e69d8a --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk/kernel.cpp @@ -0,0 +1,147 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, const float *database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); + +#if 
defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp new file mode 100644 index 000000000..67533d6da --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v1/kernel.cpp @@ -0,0 +1,147 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v1(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp new file mode 100644 index 000000000..d7c2bd9c3 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v2/kernel.cpp @@ -0,0 +1,147 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v2(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp new file mode 100644 index 000000000..ddea465b0 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp @@ -0,0 +1,147 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v3(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); 
+ +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp new file mode 100644 index 000000000..aaaf5317d --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/heap.hpp @@ -0,0 +1,40 @@ +#pragma once +#include +#include + +template +class DynHeap { +public: + DynHeap(T *data, int N): + _data(data), + _data_N(N), + _n(0){ + } + + void push(T i) { + _data[_n++] = i; + std::push_heap(_data, _data+_n, Comparitor()); + if (_n > _data_N) pop(); + } + + T pop() { + std::pop_heap(_data, _data+_n--, Comparitor()); + return _data[_n]; + } + + T top() const { + return _data[0]; + } + + bool empty() const { + return _n == 0; + } + + int size() const { + return _n; + } + + int _n; + int _data_N; + T *_data; +}; diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp 
b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp new file mode 100644 index 000000000..95d6b291e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/hello_world.hpp @@ -0,0 +1,6 @@ +#ifndef __HELLO_WORLD_HPP +#define __HELLO_WORLD_HPP + +#include + +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp new file mode 100644 index 000000000..8bb83077a --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp @@ -0,0 +1,89 @@ +#pragma once +#include "bsg_striped_array.hpp" +#include +#include + +template +__attribute__((noinline)) +FLOAT_T inner_product(const FLOAT_T *__restrict a, const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r += a[i + j]*b[i + j]; + } + } + return r; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v1(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r += a[i + j]*b[i + j]; + } + } + return r; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v2(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += BSIZE * TG_X * TG_Y) { + #pragma GCC unroll 32 + for (int j = 0; j < BSIZE; ++j) { + r = fmaf(a[i+j], b[i+j], r); + } + } + return r; +} + + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v3(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + FLOAT_T r0 = 0.0, r1 = 0.0; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += 2 * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { + r0 
= fmaf(a[i+j+0*BSIZE], b[i+j+0*BSIZE], r0); + r1 = fmaf(a[i+j+1*BSIZE], b[i+j+1*BSIZE], r1); + } + } + return r0+r1; +} + +template +__attribute__((noinline)) +FLOAT_T inner_product_v4(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL]; + for (int i = __bsg_id * BSIZE; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + int rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + return rs; +} diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp new file mode 100644 index 000000000..2ef683838 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/set.hpp @@ -0,0 +1,73 @@ +#pragma once +#include +#include +template +class DynSet { +public: + DynSet(T *data, int N): + _data(data), + _data_N(N), + _n(0) { + } + + void insert(T i) { + _data[_n++] = i; + std::sort(_data, _data+_n, Comparitor()); + } + + bool in(T i) { + return std::binary_search(_data, _data+_n, i, Comparitor()); + } + + int size() const { + return _n; + } + + T *_data; + int _n; + int _data_N; +}; + +template +class DenseSet { +public: + DenseSet(int *data): + _data(data) { + } + + void insert(T i) { + _data[i] = 1; + } + + bool in(T i) { + return _data[i] == 1; + } + + int *_data; +}; + +template +class DenseSet_v1 { +public: + DenseSet_v1(int *data) : + _data(data){ + } + + void insert(T i) { + _data[word(i)] |= (1 << bit(i)); + } + + bool in(T i) { + return _data[word(i)] & (1 << bit(i)); + } + + int word(T i) const { + return i >> 5; + } + + int bit(T i) const { + return i & 31; + } + int *_data; +}; + diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp new file mode 100644 
index 000000000..fc8dd7c82 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk/kernel.cpp @@ -0,0 +1,71 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + + int inner_product_ubmk(const float *database, const float *query, int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + const float *b = &database[i*3*VSIZE]; + r += inner_product(q,b); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp new file mode 100644 index 000000000..2deb68437 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v1/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined 
+// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v1(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp new file mode 100644 index 000000000..0d4fce43b --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v2/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. 
bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v2(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp new file mode 100644 index 000000000..8f1058017 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v3/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v3(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp new file mode 100644 index 000000000..c1ab7a9ba --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_v4/kernel.cpp @@ -0,0 +1,76 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v4(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N) + { + float q[VSIZE]; + float r = 0; + + bsg_print_int(N); + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + // perform a random inner product N times + for (int i = 0; i < N; ++i) { + //const float *b = &database[i*3*VSIZE]; + r += iproduct(q, &database[i*3*VSIZE]); + } + bsg_cuda_print_stat_end(0); + return (int)(r); + } +#ifdef __cplusplus +} +#endif From 0723d9070199aae2b6b9d4d52a9869ae190c692a Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 15:31:30 -0700 Subject: [PATCH 02/22] [ipnsw] Older versions --- .../ipnsw/BeamSearchKernelRunner.hpp | 42 ++- .../ipnsw/BeamSearchResultReader.hpp | 5 +- .../ipnsw/GreedyWalkKernelRunner.hpp | 24 +- .../ipnsw/GreedyWalkResultReader.hpp | 4 +- .../sdh-eval-workloads/ipnsw/GroupData.hpp | 10 + examples/sdh-eval-workloads/ipnsw/IO.hpp | 32 ++ .../ipnsw/IPNSWKernelRunner.hpp | 15 +- .../sdh-eval-workloads/ipnsw/IPNSWRunner.hpp | 231 ++++++++++++--- .../ipnsw/IProductUBmkFactory.hpp | 6 +- .../ipnsw/IProductUBmkKernelRunner.hpp | 9 +- .../ipnsw/IProductUBmkParallelFactory.hpp | 20 ++ .../IProductUBmkParallelKernelRunner.hpp | 
64 ++++ examples/sdh-eval-workloads/ipnsw/Makefile | 120 +++----- examples/sdh-eval-workloads/ipnsw/ipnsw.cpp | 50 ++-- examples/sdh-eval-workloads/ipnsw/ipnsw.hpp | 1 - .../ipnsw/kernel/beam_search_v1/kernel.cpp | 2 +- .../ipnsw/kernel/beam_search_v10/kernel.cpp | 279 ++++++++++++++++++ .../beam_search_v5-ipv4serial/kernel.cpp | 192 ++++++++++++ .../ipnsw/kernel/beam_search_v5/kernel.cpp | 11 +- .../ipnsw/kernel/beam_search_v6/kernel.cpp | 195 ++++++++++++ .../ipnsw/kernel/beam_search_v7/kernel.cpp | 194 ++++++++++++ .../ipnsw/kernel/beam_search_v8/kernel.cpp | 270 +++++++++++++++++ .../ipnsw/kernel/beam_search_v9/kernel.cpp | 249 ++++++++++++++++ .../greedy_walk_v3-ipv4serial/kernel.cpp | 147 +++++++++ .../ipnsw/kernel/greedy_walk_v3/kernel.cpp | 2 +- .../ipnsw/kernel/greedy_walk_v4/kernel.cpp | 152 ++++++++++ .../kernel/greedy_walk_v4/kernel.loc.cpp | 113 +++++++ .../ipnsw/kernel/greedy_walk_v4/loc.sh | 1 + .../ipnsw/kernel/include/inner_product.hpp | 236 ++++++++++++++- .../ipnsw/kernel/include/set.hpp | 15 +- .../kernel/include/sleep_until_valid.hpp | 28 ++ .../iproduct_ubmk-parallel_v1/kernel.cpp | 180 +++++++++++ .../iproduct_ubmk-parallel_v2/kernel.cpp | 154 ++++++++++ .../iproduct_ubmk-parallel_v3/kernel.cpp | 80 +++++ .../kernel/iproduct_ubmk_parallel/kernel.cpp | 94 ++++++ 35 files changed, 3030 insertions(+), 197 deletions(-) create mode 100644 examples/sdh-eval-workloads/ipnsw/GroupData.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp create mode 100644 
examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/include/sleep_until_valid.hpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp create mode 100644 examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp index 426042f6d..6fe724b68 100644 --- a/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchKernelRunner.hpp @@ -9,30 +9,44 @@ namespace ipnsw { return "ipnsw_beam_search"; } + Dim tgd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grp_x(), + runner.cfg().grp_y()); + } + + Dim gd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grid_x(), + runner.cfg().grid_y()); + } std::vector argv(const IPNSWRunner & runner) const { int v_curr; float d_curr; - v_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); - d_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + std::vector do_queries = runner._io->do_queries(); + if (do_queries.empty()) { + v_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + d_curr = std::get(GREEDY_WALK_RESULTS[IPNSWRunner::QUERY]); + } else { + 
v_curr = std::get(GREEDY_WALK_RESULTS[do_queries[0]]); + d_curr = std::get(GREEDY_WALK_RESULTS[do_queries[0]]); + } HammerBlade::Ptr hb = HammerBlade::Get(); - hb->write(runner.v_curr_dev(), &v_curr, sizeof(v_curr)); - hb->write(runner.d_curr_dev(), &d_curr, sizeof(d_curr)); + hb->write(runner.v_curr_dev(0), &v_curr, sizeof(v_curr)); + hb->write(runner.d_curr_dev(0), &d_curr, sizeof(d_curr)); std::vector argv = { runner.graph_metadata_dev(), runner.db_dev(), - runner.query_dev(), - runner.seen_dev(), - runner.v_curr_dev(), - runner.d_curr_dev(), - runner.candidates_dev(), - runner.results_dev(), - runner.n_results_dev(), + runner.query_dev(0), + runner.seen_dev(0), + runner.v_curr_dev(0), + runner.d_curr_dev(0), + runner.candidates_dev(0), + runner.results_dev(0), + runner.n_results_dev(0), }; return argv; - }; - Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);} - Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);} + } + }; } diff --git a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp index ce77d324f..3d4cc7493 100644 --- a/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp +++ b/examples/sdh-eval-workloads/ipnsw/BeamSearchResultReader.hpp @@ -9,11 +9,12 @@ namespace ipnsw { void readResults(const IPNSWRunner & runner) { HammerBlade::Ptr hb = HammerBlade::Get(); + hb_mc_eva_t grp = 0; int n_results; - hb->read(runner.n_results_dev(), &n_results, sizeof(int)); + hb->read(runner.n_results_dev(grp), &n_results, sizeof(int)); std::vector results(n_results); - hb->push_read(runner.results_dev(), &results[0], n_results * sizeof(GreedyWalkResult)); + hb->push_read(runner.results_dev(grp), &results[0], n_results * sizeof(GreedyWalkResult)); hb->sync_read(); std::cout << "Beam search:" << std::endl; diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp index 72eea9f0f..ac51739b4 100644 --- 
a/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkKernelRunner.hpp @@ -4,6 +4,17 @@ namespace ipnsw { class GreedyWalkKernelRunner : public IPNSWKernelRunner { + + Dim tgd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grp_x(), + runner.cfg().grp_y()); + } + + Dim gd(const IPNSWRunner & runner) const { + return Dim(runner.cfg().grid_x(), + runner.cfg().grid_y()); + } + std::string kernelName(const IPNSWRunner & runner) const { return "ipnsw_greedy_search"; } @@ -12,14 +23,13 @@ namespace ipnsw { std::vector argv = { runner.graph_metadata_dev(), runner.db_dev(), - runner.query_dev(), - runner.seen_dev(), - runner.v_curr_dev(), - runner.d_curr_dev(), + runner.query_dev(0), + runner.seen_dev(0), + runner.v_curr_dev(0), + runner.d_curr_dev(0), }; return argv; - }; - Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);} - Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);} + } + }; } diff --git a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp index ae57cd548..6ca7851ff 100644 --- a/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp +++ b/examples/sdh-eval-workloads/ipnsw/GreedyWalkResultReader.hpp @@ -10,8 +10,8 @@ namespace ipnsw { int v_curr; float d_curr; - hb->read(runner.v_curr_dev(), &v_curr, sizeof(int)); - hb->read(runner.d_curr_dev(), &d_curr, sizeof(float)); + hb->read(runner.v_curr_dev(0), &v_curr, sizeof(int)); + hb->read(runner.d_curr_dev(0), &d_curr, sizeof(float)); std::cout << "Greedy walk (v_curr,d_curr) = " << "(" << v_curr << "," << d_curr << ")" diff --git a/examples/sdh-eval-workloads/ipnsw/GroupData.hpp b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp new file mode 100644 index 000000000..b9052ab23 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/GroupData.hpp @@ -0,0 +1,10 @@ +#include +namespace ipnsw { + struct GroupData { + hb_mc_eva_t seen_mem; + hb_mc_eva_t 
candidates_mem; + hb_mc_eva_t results_mem; + hb_mc_eva_t curr; + hb_mc_eva_t n_results; + }; +}; diff --git a/examples/sdh-eval-workloads/ipnsw/IO.hpp b/examples/sdh-eval-workloads/ipnsw/IO.hpp index 7dd4ef05e..52f0bad5b 100644 --- a/examples/sdh-eval-workloads/ipnsw/IO.hpp +++ b/examples/sdh-eval-workloads/ipnsw/IO.hpp @@ -136,6 +136,38 @@ namespace ipnsw { return n; } + int grid_x() const { + auto s = option("--grid-x"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + int grid_y() const { + auto s = option("--grid-y"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + int grp_x() const { + auto s = option("--group-x"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + + int grp_y() const { + auto s = option("--group-y"); + if (!s.empty()) + return from_string(s); + else + return 1; + } + std::string ucode() const { return _ucode; } std::string version() const { return _version; } std::string exe() const { return _exe; } diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp index e6042acaa..1604cb93e 100644 --- a/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWKernelRunner.hpp @@ -9,15 +9,24 @@ namespace ipnsw { public: using HammerBlade = hammerblade::host::HammerBlade; using Dim = hammerblade::host::Dim; - IPNSWKernelRunner() {} + IPNSWKernelRunner(){} protected: virtual std::string kernelName(const IPNSWRunner & runner) const =0; virtual std::vector argv(const IPNSWRunner & runner) const =0; - virtual Dim gd(const IPNSWRunner &runner) const = 0; - virtual Dim tgd(const IPNSWRunner &runner) const = 0; public: + virtual Dim gd(const IPNSWRunner &runner) const { + return Dim(1,1); + } + virtual Dim tgd(const IPNSWRunner &runner) const { + return Dim(1,1); + } + + public: + virtual void beforeLaunchKernel(const IPNSWRunner &runner) { } + virtual void afterLaunchKernel(const 
IPNSWRunner &runner) { } + void runKernel(IPNSWRunner &runner) { HammerBlade::Ptr hb = HammerBlade::Get(); hb->push_jobv(gd(runner), diff --git a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp index 3dbca5bec..feebf121d 100644 --- a/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp +++ b/examples/sdh-eval-workloads/ipnsw/IPNSWRunner.hpp @@ -6,10 +6,59 @@ #include "IPNSWKernelRunner.hpp" #include "IPNSWResultReader.hpp" #include "GreedyWalkResults.hpp" +#include "GroupData.hpp" #include namespace ipnsw { + class IPNSWRunnerConfig { + public: + typedef enum { + Dense, + BitVector, + Sparse, + } SetType; + + IPNSWRunnerConfig(): + _set_type(BitVector), + _grid_x(1), + _grid_y(1), + _grp_x(1), + _grp_y(1) { + } + + SetType set_type() const { return _set_type; } + SetType & set_type() { return _set_type; } + + std::string set_type_str() const { + switch (set_type()) { + case Dense: + return "Dense"; + case BitVector: + return "Dense Bit Vector"; + case Sparse: + return "Sparse"; + } + } + + int & grid_x() { return _grid_x; } + int grid_x() const { return _grid_x; } + int & grid_y() { return _grid_y; } + int grid_y() const { return _grid_y; } + + int & grp_x() { return _grp_x; } + int grp_x() const { return _grp_x; } + int & grp_y() { return _grp_y; } + int grp_y() const { return _grp_y; } + + private: + SetType _set_type; + int _grid_x; + int _grid_y; + int _grp_x; + int _grp_y; + }; + class IPNSWRunner { public: //static constexpr int QUERY = 276; // fewest dot products for greedy walk @@ -24,12 +73,23 @@ namespace ipnsw { //static constexpr int QUERY = 461; //static constexpr int QUERY = 470; + + static constexpr size_t CANDIDATES_MAX = 513; + static constexpr size_t RESULTS_MAX = 129; + using HammerBlade = hammerblade::host::HammerBlade; using Dim = hammerblade::host::Dim; IPNSWRunner(const Parser &p, - std::unique_ptr & fact): - _factory(std::move(fact)) { + std::unique_ptr & fact) : + IPNSWRunner(p, fact, 
IPNSWRunnerConfig()) { + } + + IPNSWRunner(const Parser &p, + std::unique_ptr & fact, + const IPNSWRunnerConfig &cfg): + _factory(std::move(fact)), + _cfg(cfg) { _io = std::unique_ptr(new IO(p)); _hb = HammerBlade::Get(); _kernel_runner = _factory->KernelRunner(); @@ -63,19 +123,41 @@ namespace ipnsw { void initializeDeviceMemoryQuery() { std::cout << "Initializing query " << std::endl; - int query = QUERY; - auto do_queries = _io->do_queries(); - if (!do_queries.empty()) - query = do_queries[0]; + std::vector do_queries = _io->do_queries(); + if (do_queries.empty()) { + do_queries = {QUERY}; + } + + _query_dev = _hb->alloc(sizeof(_queries[0]) * do_queries.size()); - _query_dev = _hb->alloc(sizeof(_queries[query])); - _hb->push_write(_query_dev, &_queries[query], sizeof(_queries[query])); + for (hb_mc_eva_t qidx = 0; qidx < do_queries.size(); ++qidx) { + int query = do_queries[qidx]; + _hb->push_write(_query_dev + qidx * sizeof(_queries[query]), + &_queries[query], + sizeof(_queries[query])); + } } + size_t seen_dev_size_per_group() const { + size_t size, words; + switch (_cfg.set_type()) { + case IPNSWRunnerConfig::Dense: + case IPNSWRunnerConfig::Sparse: + return _db.size() * sizeof(int); + case IPNSWRunnerConfig::BitVector: + words = _db.size()/32; + if (_db.size() % 32 != 0) + words += 1; + return words * sizeof(int); + } + } void initializeDeviceMemorySeen() { std::cout << "Initializing seen set " << std::endl; - _seen_dev = _hb->alloc(_db.size() * sizeof(int)); + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(seen_dev_size_per_group()); + _seen_dev.push_back(dev); + } } void initializeDeviceMemoryGraphs() { @@ -85,23 +167,55 @@ namespace ipnsw { _graph_metadata_dev = Graph::InitializeMetadataOnDevice(_graphs); } - void initializeDeviceVCurr() { - _v_curr_dev = _hb->alloc(sizeof(int)); + void initializeDeviceVCurrDCurr() { + _curr_dev = _hb->alloc(sizeof(GreedyWalkResult) * numGroups()); + hb_mc_eva_t grp = 0; + std::cout << std::hex; + 
std::cout << "_curr_dev=" << std::hex << _curr_dev << std::endl; + std::cout << " curr(" << std::dec << grp << ")=" << std::hex << curr_dev(grp) << std::endl; + std::cout << "v_curr(" << std::dec << grp << ")=" << std::hex << v_curr_dev(grp) << std::endl; + std::cout << "d_curr(" << std::dec << grp << ")=" << std::hex << d_curr_dev(grp) << std::endl; + std::cout << std::dec; } - void initializeDeviceDCurr() { - _d_curr_dev = _hb->alloc(sizeof(float)); + + size_t candidates_dev_size_per_group() const { + return sizeof(GreedyWalkResult) * CANDIDATES_MAX; } void initializeDeviceCandidateDev() { - _candidates_dev = _hb->alloc(sizeof(GreedyWalkResult)*513); + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(candidates_dev_size_per_group()); + _candidates_dev.push_back(dev); + } + } + + size_t results_dev_size_per_group() const { + return sizeof(GreedyWalkResult) * RESULTS_MAX; } void initializeDeviceResultsDev() { - _results_dev = _hb->alloc(sizeof(GreedyWalkResult) * 129); + for (int i = 0; i < numGroups(); ++i) { + hb_mc_eva_t dev = _hb->alloc(results_dev_size_per_group()); + _results_dev.push_back(dev); + } } void initializeDeviceNResultsDev() { - _n_results_dev = _hb->alloc(sizeof(int)); + _n_results_dev = _hb->alloc(sizeof(int) * numGroups()); + } + + void initializeGroupData() { + _group_data_dev = _hb->alloc(sizeof(GroupData) * numGroups()); + for (int i = 0; i < numGroups(); ++i) { + GroupData gd = { + .seen_mem = seen_dev(i), + .candidates_mem = candidates_dev(i), + .results_mem = results_dev(i), + .curr = curr_dev(i), + .n_results = n_results_dev(i), + }; + _hb->push_write(group_data_dev(i), &gd, sizeof(gd)); + } } void initializeDeviceMemory() { @@ -109,19 +223,21 @@ namespace ipnsw { initializeDeviceMemoryQuery(); initializeDeviceMemorySeen(); initializeDeviceMemoryGraphs(); - initializeDeviceVCurr(); - initializeDeviceDCurr(); + initializeDeviceVCurrDCurr(); initializeDeviceCandidateDev(); initializeDeviceResultsDev(); 
initializeDeviceNResultsDev(); - // sync - std::cout << "Starting DMA" << std::endl; - _hb->sync_rw(); + initializeGroupData(); } void runKernel() { + _kernel_runner->beforeLaunchKernel(*this); + // sync + std::cout << "Starting DMA" << std::endl; + _hb->sync_rw(); std::cout << "Launching kernel" << std::endl; _kernel_runner->runKernel(*this); + _kernel_runner->afterLaunchKernel(*this); } void readResults() { @@ -145,36 +261,75 @@ namespace ipnsw { } hb_mc_eva_t db_dev() const { return _db_dev; } - hb_mc_eva_t query_dev() const { return _query_dev; } - hb_mc_eva_t seen_dev() const { return _seen_dev; } - hb_mc_eva_t v_curr_dev() const { return _v_curr_dev; } - hb_mc_eva_t d_curr_dev() const { return _d_curr_dev; } + hb_mc_eva_t query_dev(hb_mc_eva_t qidx) const { + return _query_dev + qidx * sizeof(_queries[qidx]); + } + + hb_mc_eva_t seen_dev(hb_mc_eva_t grp) const { + return _seen_dev[grp]; + } + + hb_mc_eva_t curr_dev(hb_mc_eva_t grp = 0) const { + return _curr_dev + (grp*sizeof(GreedyWalkResult)); + } + + hb_mc_eva_t v_curr_dev(hb_mc_eva_t grp) const { + return curr_dev(grp) + sizeof(float); + } + hb_mc_eva_t d_curr_dev(hb_mc_eva_t grp) const { + return curr_dev(grp); + } + hb_mc_eva_t graph_metadata_dev() const { return _graph_metadata_dev; } - hb_mc_eva_t candidates_dev() const { return _candidates_dev; } - hb_mc_eva_t results_dev() const { return _results_dev; } - hb_mc_eva_t n_results_dev() const { return _n_results_dev; } + hb_mc_eva_t candidates_dev(hb_mc_eva_t grp) const { + return _candidates_dev[grp]; + } + + hb_mc_eva_t results_dev(hb_mc_eva_t grp) const { + return _results_dev[grp]; + } + + hb_mc_eva_t n_results_dev(hb_mc_eva_t grp) const { + return _n_results_dev + grp * sizeof(int); + } + + hb_mc_eva_t group_data_dev(hb_mc_eva_t grp) const { + return _group_data_dev + grp * sizeof(GroupData); + } + + int numGroups() const { return _kernel_runner->gd(*this).x() * _kernel_runner->gd(*this).y(); } + + const std::vector> & db() const { return _db; } + 
+ const IPNSWRunnerConfig & cfg() const { return _cfg; } ///////////// // Setters // ///////////// private: - std::unique_ptr _io; - std::vector _graphs; - std::vector> _db; - std::vector> _queries; - HammerBlade::Ptr _hb; + IPNSWRunnerConfig _cfg; + + public: + std::unique_ptr _io; + + private: + std::vector _graphs; + std::vector> _db; + std::vector> _queries; + std::vector _group_data; + HammerBlade::Ptr _hb; // device pointers hb_mc_eva_t _db_dev; hb_mc_eva_t _query_dev; - hb_mc_eva_t _seen_dev; - hb_mc_eva_t _v_curr_dev; - hb_mc_eva_t _d_curr_dev; + std::vector _seen_dev; + hb_mc_eva_t _curr_dev; hb_mc_eva_t _graph_metadata_dev; - hb_mc_eva_t _candidates_dev; - hb_mc_eva_t _results_dev; + std::vector _candidates_dev; + std::vector _results_dev; hb_mc_eva_t _n_results_dev; + hb_mc_eva_t _group_data_dev; // composites std::unique_ptr _kernel_runner; diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp index 9a4861844..ff0468903 100644 --- a/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkFactory.hpp @@ -9,9 +9,9 @@ namespace ipnsw { _iterations(iterations) { } - private: - IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkKernelRunner(_iterations); } - IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; } + protected: + virtual IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkKernelRunner(_iterations); } + virtual IPNSWResultReader *_ResultReader() const { return new IProductUBmkResultReader; } int _iterations; }; diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp index e9d3010bc..1ee4da763 100644 --- a/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkKernelRunner.hpp @@ -15,17 +15,16 @@ namespace ipnsw { return 
"inner_product_ubmk"; } - std::vector argv(const IPNSWRunner & runner) const { + virtual std::vector argv(const IPNSWRunner & runner) const { std::vector argv = { runner.db_dev(), // database - runner.query_dev(), // query + runner.query_dev(0), // query static_cast(_iterations), // number of inner products }; return argv; - }; - Dim gd(const IPNSWRunner &runner) const {return Dim(1,1);} - Dim tgd(const IPNSWRunner &runner) const {return Dim(1,1);} + } + protected: int _iterations; }; } diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp new file mode 100644 index 000000000..964cc2d8e --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelFactory.hpp @@ -0,0 +1,20 @@ +#pragma once +#include "IPNSWFactory.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IProductUBmkResultReader.hpp" +#include "IProductUBmkFactory.hpp" +#include "IProductUBmkParallelKernelRunner.hpp" + +namespace ipnsw { + class IProductUBmkParallelFactory : public IProductUBmkFactory { + public: + IProductUBmkParallelFactory(int itertions = 10): + IProductUBmkFactory(itertions) { + } + + private: + IPNSWKernelRunner *_KernelRunner() const { return new IProductUBmkParallelKernelRunner(_iterations); } + + }; +} + diff --git a/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp new file mode 100644 index 000000000..668114fb2 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/IProductUBmkParallelKernelRunner.hpp @@ -0,0 +1,64 @@ +#pragma once +#include "IPNSWKernelRunner.hpp" +#include "IProductUBmkKernelRunner.hpp" +#include "IPNSWRunner.hpp" +#include "HammerBlade.hpp" +#include + +namespace ipnsw { + class IProductUBmkParallelKernelRunner : public IProductUBmkKernelRunner { + public: + IProductUBmkParallelKernelRunner(int iterations = 10) : + IProductUBmkKernelRunner(iterations) { + } + 
+ private: + using HammerBlade = hammerblade::host::HammerBlade; + + void beforeLaunchKernel(const IPNSWRunner &runner) { + HammerBlade::Ptr _hb = HammerBlade::Get(); + + _visit.clear(); + + for (int i = 0; i < _iterations * runner.numGroups(); ++i) { + _visit.push_back((i*3) % runner.db().size()); + } + std::random_shuffle(_visit.begin(), _visit.end()); + + _visit_dev = _hb->alloc(sizeof(int) * _visit.size()); + + std::cout << "beforeLaunchKernel called: _visit_dev = " << std::hex << _visit_dev << std::endl; + std::cout << std::dec; + + _hb->push_write(_visit_dev, &_visit[0], sizeof(int) * _visit.size()); + } + + std::vector argv(const IPNSWRunner & runner) const { + std::cout << "Called" << std::endl; + std::vector argv = { + runner.db_dev(), // database + runner.query_dev(0), // query + static_cast(_iterations), // number of inner products + _visit_dev, // vectors to visit + }; + return argv; + } + + void afterLaunchKernel(const IPNSWRunner &runner) { + HammerBlade::Ptr _hb = HammerBlade::Get(); + _hb->free(_visit_dev); + _visit.clear(); + } + + virtual Dim gd(const IPNSWRunner &runner) const { + return Dim(runner.cfg().grid_x(),runner.cfg().grid_y()); + } + + virtual Dim tgd(const IPNSWRunner &runner) const { + return Dim(runner.cfg().grp_x(),runner.cfg().grp_y()); + } + + hb_mc_eva_t _visit_dev; + std::vector _visit; + }; +} diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index 6e814f018..b4bfa09d7 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -100,30 +100,17 @@ KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp # END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES ################################################################################ - -################################################################################ -# Define the $(HOST_TARGET), the name of the host executable to generate. 
The -# cosimulation host executable will be called -# $(HOST_TARGET).cosim. HOST_*SOURCES list the host files that should be -# compiled and linked into the executable. -################################################################################ - -HOST_TARGET := ipnsw -HOST_CSOURCES := -HOST_CXXSOURCES += GreedyWalkResults.cpp -HOST_INCLUDES := -I$(CURRENT_PATH) - ################################################################################ # Include the Cosimulation host build rules (This must be included after # HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc) ################################################################################ - -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0 -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1 -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2 -ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3 +HOST_TARGET = ipnsw +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0 +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1 +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2 +C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3 ################################ @@ -140,7 +127,9 @@ kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp cp $$< $$@ # adds arguments -kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: ARGS += --num-iproducts $(1) +kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1) +kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv +kernel/iproduct_ubmk-$(1)/kernel.rvo: RISCV_CXX = 
$(RISCV_CLANGXX) # adds to list of iproduct u-bmk IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1) @@ -162,7 +151,7 @@ purge-iproduct-ubmk: # collect stats for all iproduct-ubmk-stats: create-iproduct-ubmk -iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/stats) +iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) # Add to versions VERSIONS += $(IPRODUCT-UBMK-VERSIONS) @@ -179,7 +168,9 @@ kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.c cp $$< $$@ # adds arguments -kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1) +kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1) +kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/greedy_walk-query$(1)/kernel.riscv +kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) # adds to list of greedy walk versions GREEDY-WALK-VERSIONS += greedy_walk-query$(1) @@ -201,7 +192,7 @@ purge-greedy-walk: # collect stats for all greedy-walk-stats: create-greedy-walk -greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/stats) +greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) # Add to versions VERSIONS += $(GREEDY-WALK-VERSIONS) @@ -219,7 +210,9 @@ kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.c cp $$< $$@ # adds arguments -kernel/beam_search-query$(1)/$(HOST_TARGET).log: ARGS += --queries $(1) +kernel/beam_search-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1) +kernel/beam_search-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/beam_search-query$(1)/kernel.riscv +kernel/beam_search-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) # adds to list of greedy walk versions BEAM-SEARCH-VERSIONS += beam_search-query$(1) @@ -242,7 +235,7 @@ purge-beam-search: # collect stats for all beam-search-stats: create-beam-search -beam-search-stats: $(foreach
v,$(BEAM-SEARCH-VERSIONS),kernel/$v/stats) +beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) # Add to versions VERSIONS += $(BEAM-SEARCH-VERSIONS) @@ -251,31 +244,32 @@ VERSIONS += $(BEAM-SEARCH-VERSIONS) # Continue including cosim build rules # ######################################## --include $(FRAGMENTS_PATH)/host/cosim.mk - -GRAPH-TOOLS := $(CURRENT_PATH)/graph-tools +GRAPH-TOOLS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools graphtools-dir := $(GRAPH-TOOLS) include $(GRAPH-TOOLS)/libgraphtools.mk -HB-HELPERS := $(CURRENT_PATH)/hammerblade-helpers +HB-HELPERS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers +hammerblade-helpers-dir := $(HB-HELPERS) include $(HB-HELPERS)/libhammerblade-helpers-host.mk CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags) CXXFLAGS += $(libgraphtools-interface-cxxflags) +CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw +CXXFLAGS += -DCOSIM LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags) LDFLAGS += $(libgraphtools-interface-ldflags) -VSOURCES += GreedyWalkResults.cpp - -$(HOST_TARGET): $(libhammerblade-helpers-host-interface-headers) -$(HOST_TARGET): $(libgraphtools-interface-headers) -$(HOST_TARGET): $(libgraphtools-interface-libraries) -$(HOST_TARGET): GreedyWalkResults.o +GreedyWalkResults.o: $(libhammerblade-helpers-host-interface-headers) +GreedyWalkResults.o: $(libgraphtools-interface-headers) +GreedyWalkResults.o: $(libgraphtools-interface-libraries) GreedyWalkResults.o: GreedyWalkResults.cpp GreedyWalkResults.o: GreedyWalkResults.hpp +ipnsw.o: $(libhammerblade-helpers-host-interface-headers) +ipnsw.o: $(libgraphtools-interface-headers) +ipnsw.o: $(libgraphtools-interface-libraries) ipnsw.o: IO.hpp ipnsw.o: IPNSWGraph.hpp ipnsw.o: IPNSWRunner.hpp @@ -292,6 +286,13 @@ ipnsw.o: GreedyWalkFactory.hpp ipnsw.o: BeamSearchFactory.hpp ipnsw.o: IProductUBmkFactory.hpp ipnsw.o: StringHelpers.hpp + +TEST_SOURCES = ipnsw.cpp 
GreedyWalkResults.cpp + +-include $(EXAMPLES_PATH)/compilation.mk +-include $(EXAMPLES_PATH)/link.mk +-include $(EXAMPLES_PATH)/execution.mk + ################################################################################ # Define the clean rules. clean calls the makefile-specific cleans, whereas # users can add commands and dependencies to custom.clean. @@ -302,50 +303,3 @@ version.clean: custom.clean: version.clean -clean: cosim.clean analysis.clean cudalite.clean custom.clean - -################################################################################ -# Define overall-goals. The all rule runs all kernel versions, and the default -# kernel. -################################################################################ - -_HELP_STRING := "Makefile Rules\n" - -_HELP_STRING += " default: \n" -_HELP_STRING += " - Run the default kernel ($KERNEL_DEFAULT) and generate all of the\n" -_HELP_STRING += " analysis products\n" -default: pc_stats graphs stats - -_HELP_STRING += " analysis: \n" -_HELP_STRING += " - Launch indpendent cosimulation executions of each kernel version.\n" -_HELP_STRING += " When execution finishes, it generates all the analysis products \n" -_HELP_STRING += " for each kernel in each respective kernel// \n" -_HELP_STRING += " directory\n" -analysis: $(foreach v,$(VERSIONS),kernel/$v/pc_stats kernel/$v/graphs kernel/$v/stats) - -_HELP_STRING += " statistics: \n" -_HELP_STRING += " - Launch indpendent cosimulation executions of each kernel version.\n" -_HELP_STRING += " When execution finishes, it generates ONLY the parsed operation \n" -_HELP_STRING += " stats for each kernel in each respective kernel// \n" -_HELP_STRING += " directory\n" -statistics: $(foreach v,$(VERSIONS),kernel/$v/stats) - -_HELP_STRING += " all: \n" -_HELP_STRING += " - Launch both the default and analysis target\n" -all: analysis default - -.DEFAULT_GOAL = help -_HELP_STRING += " help: \n" -_HELP_STRING += " - Output a friendly help message.\n" -help: - @echo -e 
$(HELP_STRING) - -# Always re-run, if asked. -.PHONY: default analysis help - -# These last three lines ensure that _HELP_STRING is appended to the top of -# whatever else comes before it. -_HELP_STRING += "\n" -_HELP_STRING += $(HELP_STRING) -HELP_STRING := $(_HELP_STRING) - diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp index ea920b295..8de8e073e 100644 --- a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp +++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp @@ -1,3 +1,4 @@ +#include "bsg_manycore_regression.h" #include "ipnsw.hpp" #include "HammerBlade.hpp" #include "Graph500Data.hpp" @@ -8,6 +9,7 @@ #include "IProductUBmkKernelRunner.hpp" #include "IProductUBmkResultReader.hpp" #include "IProductUBmkFactory.hpp" +#include "IProductUBmkParallelFactory.hpp" #include "BeamSearchKernelRunner.hpp" #include "BeamSearchResultReader.hpp" #include "BeamSearchFactory.hpp" @@ -19,8 +21,6 @@ #include #include -#include "GreedyWalkResults.cpp" - using namespace ipnsw; int Main(int argc, char *argv[]) @@ -31,6 +31,12 @@ int Main(int argc, char *argv[]) std::unique_ptr runner; std::unique_ptr factory; + IPNSWRunnerConfig cfg; + cfg.grid_x() = args.grid_x(); + cfg.grid_y() = args.grid_y(); + cfg.grp_x() = args.grp_x(); + cfg.grp_y() = args.grp_y(); + if (ipnsw::startswith(args.version(), "greedy_walk")) { factory = std::unique_ptr(new GreedyWalkFactory); } else if (ipnsw::startswith(args.version(), "beam_search")) { @@ -39,11 +45,20 @@ int Main(int argc, char *argv[]) /* parse the number of inner products */ std::cout << "num inner products " << args.num_iproducts() << std::endl; int n_iproducts = args.num_iproducts(); - factory = std::unique_ptr(new IProductUBmkFactory(n_iproducts)); + + bool parallel = args.version().find("parallel") != std::string::npos; + if (parallel) { + factory = std::unique_ptr(new IProductUBmkParallelFactory(n_iproducts)); + } else { + factory = std::unique_ptr(new IProductUBmkFactory(n_iproducts)) ; + 
} + } else if (args._version == "debug") { /* just for debugging */ std::cout << "--num-iproducts=" << args.num_iproducts() << std::endl; - std::cout << "--queries="; + std::cout << "--queries=" << std::endl; + std::cout << "--group-x=" << args.grp_x() << std::endl; + std::cout << "--group-y=" << args.grp_y() << std::endl; auto do_queries = args.do_queries(); for (auto q : do_queries) { std::cout << q << " "; @@ -54,33 +69,10 @@ int Main(int argc, char *argv[]) return 0; } - runner = std::unique_ptr(new IPNSWRunner(args, factory)); + runner = std::unique_ptr(new IPNSWRunner(args, factory, cfg)); runner->run(); return 0; } -#ifdef COSIM -void cosim_main(uint32_t *exit_code, char * args) { - // We aren't passed command line arguments directly so we parse them - // from *args. args is a string from VCS - to pass a string of arguments - // to args, pass c_args to VCS as follows: +c_args="" - int argc = get_argc(args); - char *argv[argc]; - get_argv(args, argc, argv); - -#ifdef VCS - svScope scope; - scope = svGetScopeFromName("tb"); - svSetScope(scope); -#endif - int rc = Main(argc, argv); - *exit_code = rc; - return; -} -#else -int main(int argc, char ** argv) { - return Main(argc, argv); -} -#endif +declare_program_main("IPNSW", Main); diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp index 385873c50..9c91c72bd 100644 --- a/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp +++ b/examples/sdh-eval-workloads/ipnsw/ipnsw.hpp @@ -34,4 +34,3 @@ #include #include #include -#include "../common.h" diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp index a69965073..9ee2ce5e7 100644 --- a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v1/kernel.cpp @@ -175,7 +175,7 @@ extern "C" { } int n_res = std::min(results.size(), N_RESULTS); - std::sort(results_mem, 
results_mem+n_res, LT()); + std::sort(results_mem, results_mem+results.size(), LT()); bsg_cuda_print_stat_end(0); *n_results = n_res; diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp new file mode 100644 index 000000000..d55e7e900 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v10/kernel.cpp @@ -0,0 +1,279 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 8 +#define BSG_TILE_GROUP_Y_DIM 4 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query 
= %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + + static constexpr int SYNC_INV = -1; + static constexpr int SYNC_DONE = -2; + + void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database, + const float *query, + int *dst_p, + float *distance_p, + int *done_p, + DenseSet_v1 *seen) + { + float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_id]); + int *done = bsg_tile_group_remote_pointer( 0, 0, &done_p[__bsg_id]); + while (true) { + int dst = sleep_until_valid(dst_p, SYNC_INV); + if (dst == SYNC_DONE) + break; + + if (!seen->in(dst)) { + seen->atomic_insert(dst); + //bsg_print_int(dst); + float tmp = distance(query, &database[dst * VSIZE]); + //bsg_print_float(tmp); + *result = tmp; + } else { + *result = -INFINITY; + } + *done = 1; + } + } + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, + int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + int dst_slave = SYNC_INV; + float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; 
+ int dist_done [BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + + if (__bsg_id != 0) { + ipnsw_distance_slave(database, q, &dst_slave, dist_result, dist_done, &seen); + } else { + bsg_saif_start(); + int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) + for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) { + dst_slave_ptr[bsg_x_y_to_id(x,y)] + = bsg_tile_group_remote_pointer(x, y, &dst_slave); + dist_result[bsg_x_y_to_id(x,y)] = INFINITY; + dist_done[bsg_x_y_to_id(x,y)] = 0; + } + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + + // traverse neighbors + for (int dst_i = 0; + dst_i < degree; + dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) { + // read-in work + int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i); + int dst_v[dst_n]; + memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v)); + + // delegate work + int dst; + for (int dst_j = 1; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + *dst_slave_ptr[dst_j] = dst; + } + // work myself + { + dst = dst_v[0]; + if (!seen.in(dst)) { + seen.atomic_insert(dst); + dist_result[0] = distance(q, &database[dst * VSIZE]); + } else { + dist_result[0] = -INFINITY; + } + dist_done[0] = 1; + } + // reduce + for (int dst_j = 0; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + bsg_wait_local_int_asm_blind(&dist_done[dst_j], 1); + dist_done[dst_j] = 0; + float d_neib = dist_result[dst_j]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_float(d_neib); +#endif + // already seen? 
+ if (d_neib == -INFINITY) + continue; + + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + *n_results = n_res; + bsg_saif_end(); + + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp new file mode 100644 index 000000000..1ced33c51 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5-ipv4serial/kernel.cpp @@ -0,0 +1,192 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + + int 
ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = distance(q, &database[dst*VSIZE]); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + bsg_cuda_print_stat_end(0); + + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp index 18a29fd33..7073bb548 100644 --- a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v5/kernel.cpp @@ -92,7 +92,8 @@ extern "C" { } // Uncomment to turn on debugging -#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT #define distance(v0, v1) \ (-1 * inner_product_v3(v0, v1)) @@ -117,8 +118,10 @@ extern "C" { // retrieve results from greedy walk int v_curr = *v_curr_o; float d_curr = *d_curr_o; - //bsg_print_int(v_curr); - //bsg_print_float(d_curr); +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // initialize priority queues DynHeap, GT> candidates(candidates_mem, 512); @@ -176,7 +179,7 @@ extern "C" { } int n_res = std::min(results.size(), N_RESULTS); - std::sort(results_mem, results_mem+n_res, LT()); + std::sort(results_mem, results_mem+results.size(), 
LT()); bsg_cuda_print_stat_end(0); *n_results = n_res; diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp new file mode 100644 index 000000000..e88095dea --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v6/kernel.cpp @@ -0,0 +1,195 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +using InnerProduct = InnerProductParallel_v1; + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + 
bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + + // Pepare other tiles for parallel inner products + InnerProduct ip(database, q); + + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + ip.init(); + + if (__bsg_id == 0) { + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = -1 * ip.inner_product(dst); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + //ip.exit(); + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + *n_results = n_res; + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp new file mode 100644 index 000000000..37d995573 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v7/kernel.cpp @@ -0,0 +1,194 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +using InnerProduct = InnerProductParallel_v1; + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float 
*__restrict database, const float *query, int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + + // Pepare other tiles for parallel inner products + InnerProduct ip(database, q); + + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + ip.init(); + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; + //bsg_print_int(v_curr); + //bsg_print_float(d_curr); + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + if (!seen.in(dst)) { + // mark as seen + seen.insert(dst); + float d_neib = -1 * ip.inner_product(dst); + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + } + + } + + //ip.exit(); + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + + bsg_cuda_print_stat_end(0); + *n_results = n_res; + + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp new file mode 100644 index 000000000..d87eaf3bd --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v8/kernel.cpp @@ -0,0 +1,270 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + + static constexpr 
int SYNC_INV = -1; + static constexpr int SYNC_DONE = -2; + + void ipnsw_distance_slave(bsg_attr_remote const float *__restrict database, + const float *query, + int *dst_p, + float *distance_p, + DenseSet_v1 *seen) + { + float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_id]); + while (true) { + int dst = sleep_until_valid(dst_p, SYNC_INV); + if (dst == SYNC_DONE) + break; + + if (!seen->in(dst)) { + seen->atomic_insert(dst); + //bsg_print_int(dst); + float tmp = distance(query, &database[dst * VSIZE]); + //bsg_print_float(tmp); + *result = tmp; + } else { + *result = -INFINITY; + } + } + } + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, + int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, + int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + int dst_slave = SYNC_INV; + float dist_result[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + + if (__bsg_id != 0) { + ipnsw_distance_slave(database, q, &dst_slave, dist_result, &seen); + } else { + + int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM]; + for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) + for (int y = 0; y < BSG_TILE_GROUP_Y_DIM; ++y) { + dst_slave_ptr[bsg_x_y_to_id(x,y)] + = bsg_tile_group_remote_pointer(x, y, &dst_slave); + dist_result[bsg_x_y_to_id(x,y)] = INFINITY; + } + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + 
seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_best+1] - dst_0; + + // traverse neighbors + for (int dst_i = 0; + dst_i < degree; + dst_i += BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM) { + // read-in work + int dst_n = std::min(BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM, degree-dst_i); + int dst_v[dst_n]; + memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v)); + + // delegate work + int dst; + for (int dst_j = 1; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + *dst_slave_ptr[dst_j] = dst; + } + // work myself + { + dst = dst_v[0]; + if (!seen.in(dst)) { + seen.atomic_insert(dst); + dist_result[0] = distance(q, &database[dst * VSIZE]); + } else { + dist_result[0] = -INFINITY; + } + } + // reduce + for (int dst_j = 0; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_float(d_neib); +#endif + // already seen? 
+ if (d_neib == -INFINITY) + continue; + + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + + } + + } + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + *n_results = n_res; + + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp new file mode 100644 index 000000000..69def7bdd --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/beam_search_v9/kernel.cpp @@ -0,0 +1,249 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 4 +#define BSG_TILE_GROUP_Y_DIM 4 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +#include "set.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +#define N_V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +class LT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) < std::get<0>(rhs); + } +}; + +class GT { +public: + bool operator()(const std::pair &lhs, const std::pair &rhs) { + return std::get<0>(lhs) > std::get<0>(rhs); + } +}; + +#ifdef __cplusplus +extern "C" { +#endif + +// Uncomment to turn on debugging +#define DEBUG_BEAM_SEARCH_TRAVERSED_TRACE +//#define DEBUG_BEAM_SEARCH_INPUT + + using InnerProduct = InnerProductParallel_Y; + + static constexpr int SYNC_INV = -1; + static constexpr int SYNC_DONE = -2; + + void ipnsw_x_master(bsg_attr_remote const float *__restrict database, + const float *query, + int *dst_p, + float *distance_p, + DenseSet_v1 *seen, + InnerProduct *ip_y) + { + float *result = bsg_tile_group_remote_pointer(0, 0, &distance_p[__bsg_x]); + while (true) { + int dst = sleep_until_valid(dst_p, SYNC_INV); + if (dst == SYNC_DONE) + break; + + if (!seen->in(dst)) { + seen->atomic_insert(dst); + //bsg_print_int(dst); + *result = -1.0 * ip_y->inner_product(dst); + } else { + *result = -INFINITY; + } + } + } + + int ipnsw_beam_search(const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, + int *seen_mem, + int *v_curr_o, float *d_curr_o, + std::pair *candidates_mem, + std::pair *results_mem, 
+ int *n_results) + { + // keep track of vertices seen + DenseSet_v1seen(seen_mem); + + // fetch graph and q out of memory + struct graph G = Gs[G_0]; + float q[VSIZE]; + bsg_cuda_print_stat_start(0); + memcpy(q, query, sizeof(q)); + + InnerProduct ip_y(database, q); + ip_y.init(); + + int dst_slave = SYNC_INV; + float dist_result[BSG_TILE_GROUP_X_DIM]; + + if (__bsg_y == 0) { + if (__bsg_x == 0) { + + int *dst_slave_ptr[BSG_TILE_GROUP_X_DIM]; + for (int x = 0; x < BSG_TILE_GROUP_X_DIM; ++x) { + dst_slave_ptr[x] = bsg_tile_group_remote_pointer(x, 0, &dst_slave); + dist_result[x] = INFINITY; + } + + // retrieve results from greedy walk + int v_curr = *v_curr_o; + float d_curr = *d_curr_o; +#ifdef DEBUG_BEAM_SEARCH_INPUT + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif + + // initialize priority queues + DynHeap, GT> candidates(candidates_mem, 512); + DynHeap, LT> results(results_mem, 128); + + candidates.push({d_curr, v_curr}); + results.push({d_curr, v_curr}); + + float d_worst = d_curr; + seen.insert(v_curr); + + while (!candidates.empty()) { + int v_best; + float d_best; + + auto best = candidates.pop(); + v_best = std::get<1>(best); + d_best = std::get<0>(best); + + d_worst = std::get<0>(results.top()); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(-v_best); +#endif + + if (d_best > d_worst) { + break; + } + + // traverse neighbors of v_best + int dst_0 = G.offsets[v_best]; + int degree = v_curr == G.V-1 ? 
G.E - dst_0 : G.offsets[v_best+1] - dst_0; + + // traverse neighbors + for (int dst_i = 0; + dst_i < degree; + dst_i += BSG_TILE_GROUP_X_DIM) { + // read-in work + int dst_n = std::min(BSG_TILE_GROUP_X_DIM, degree-dst_i); + int dst_v[dst_n]; + memcpy(dst_v, &G.neighbors[dst_0+dst_i], sizeof(dst_v)); + + // delegate work + int dst; + for (int dst_j = 1; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + *dst_slave_ptr[dst_j] = dst; + } + // work myself + { + dst = dst_v[0]; + if (!seen.in(dst)) { + seen.atomic_insert(dst); + dist_result[0] = -1.0 * ip_y.inner_product(dst); + } else { + dist_result[0] = -INFINITY; + } + } + // reduce + for (int dst_j = 0; dst_j < dst_n; ++dst_j) { + dst = dst_v[dst_j]; + +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_int(dst); +#endif + float d_neib = sleep_until_valid(&dist_result[dst_j], INFINITY); +#ifdef DEBUG_BEAM_SEARCH_TRAVERSED_TRACE + bsg_print_float(d_neib); +#endif + // already seen? + if (d_neib == -INFINITY) + continue; + + d_worst = std::get<0>(results.top()); + // if there's room for new result or this distance is promising + if ((results.size() < EF) || (d_neib < d_worst)) { + + // push onto candidates and results + candidates.push({d_neib, dst}); + results.push({d_neib, dst}); + + // prune down to recall + if (results.size() > EF) + results.pop(); + } + } + + } + + } + + // signal all columns done + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM; ++tile) + *dst_slave_ptr[tile] = SYNC_DONE; + + int n_res = std::min(results.size(), N_RESULTS); + std::sort(results_mem, results_mem+results.size(), LT()); + *n_results = n_res; + + } else { // bsg_x != 0 + ipnsw_x_master(database, q, &dst_slave, dist_result, &seen, &ip_y); + } + } + + ip_y.exit(); + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp new file mode 100644 
index 000000000..aafefe6fd --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3-ipv4serial/kernel.cpp @@ -0,0 +1,147 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR 
+//#define DEBUG_GREEDY_VIS_TR + +#define distance(v0, v1) \ + (-1 * inner_product_v4_serial(v0, v1)) + + int ipnsw_greedy_search (const graph *Gs, bsg_attr_remote const float *__restrict database, const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = distance(q, &database[v_curr*VSIZE]); + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. 
iproduct + float d = distance(q, &database[dst*VSIZE]); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp index ddea465b0..99614fc8b 100644 --- a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v3/kernel.cpp @@ -126,7 +126,7 @@ extern "C" { v_curr = dst; changed = true; -#if defined(DEBUG_GREEDY_VIS_TR) +#if defined(DEBUG_GREEDY_VCURR_TR) bsg_print_int(v_curr); bsg_print_float(d_curr); #endif // #if defined(DEBUG_GREEDY_VIS_TR) diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp new file mode 100644 index 000000000..c60b3e125 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.cpp @@ -0,0 +1,152 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +using InnerProduct = InnerProductParallel_v1; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + + int input_test(const graph *Gs, const float *database, const float *query, int *seen) + { +#if defined(DEBUG_INPUT_TEST) + bsg_printf("Gs = %08x\n", Gs); + bsg_printf("database = %08x\n", database); + bsg_printf("query = %08x\n", query); + bsg_printf("seen = %08x\n", seen); +#endif // #if defined(DEBUG_INPUT_TEST) + + struct graph G; + int v_i [] = {G_0, G_1, G_2, G_3}; + for (int j = 0; j < 4; ++j) { + int i = v_i[j]; + memcpy(&G, &Gs[i], sizeof(G)); +#if defined(DEBUG_INPUT_TEST) + bsg_printf("G[%d].offsets = %08x\n", j, G.offsets); + bsg_printf("G[%d].neighbors = %08x\n", j, G.neighbors); + bsg_printf("G[%d].V = %d\n", j, G.V); + bsg_printf("G[%d].E = %d\n", j, G.E); +#endif // #if defined(DEBUG_INPUT_TEST) + } + + return 0; + } + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +#define DEBUG_GREEDY_VIS_TR + + int ipnsw_greedy_search (const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + float q[VSIZE]; + + bsg_cuda_print_stat_start(0); + + memcpy(q, query, sizeof(q)); + + InnerProduct ip(database, q); + ip.init(); + if (__bsg_id == 0) { + bsg_saif_start(); + int v_curr = 
V_ENTRY; + float d_curr = 0; + + d_curr = -1.0 * ip.inner_product(v_curr); + +#if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VCURR_TR) || defined(DEBUG_GREEDY_VIS_TR) + + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + // fetch neighbors + int dst_0 = G.offsets[v_curr]; + int degree = v_curr == G.V-1 ? G.E - dst_0 : G.offsets[v_curr+1] - dst_0; + for (int dst_i = 0; dst_i < degree; dst_i++) { + int dst = G.neighbors[dst_0+dst_i]; + // calc. iproduct + float d = -1.0 * ip.inner_product(dst); + +#if defined(DEBUG_GREEDY_VIS_TR) + bsg_print_int(dst); + bsg_print_float(d); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + +#if defined(DEBUG_GREEDY_VCURR_TR) + bsg_print_int(v_curr); + bsg_print_float(d_curr); +#endif // #if defined(DEBUG_GREEDY_VIS_TR) + } + } + } + } + + *v_curr_o = v_curr; + *d_curr_o = d_curr; + bsg_saif_end(); + } + bsg_cuda_print_stat_end(0); + return 0; + } + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp new file mode 100644 index 000000000..12da1aebb --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/kernel.loc.cpp @@ -0,0 +1,113 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +using InnerProduct = InnerProductParallel_v1; + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_INPUT_TEST + +// Uncomment to turn on debugging +//#define DEBUG_GREEDY_VCURR_TR +#define DEBUG_GREEDY_VIS_TR + + /**/ + int ipnsw_greedy_search (const graph *Gs, + bsg_attr_remote const float *__restrict database, + const float *query, int *seen, + int *v_curr_o, float *d_curr_o) + { + /* loc:2 */ + /**/ + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + /* loc:2 */ + + /* init code - can be hidden by library*/ + InnerProduct ip(database, q); + ip.init(); + if (__bsg_id == 0) { + bsg_saif_start(); + /**/ + int v_curr = V_ENTRY; + float d_curr = 0; + + d_curr = -1.0 * ip.inner_product(v_curr); + + /**/ + for (int i = 0; i < NG-1; i++) { + struct graph G = Gs[i]; + bool changed = true; + while (changed) { + changed = false; + /* loc:5 */ + // fetch neighbors + /**/ + for (int dst : G.neighbors(v_curr)) { + float d = -1.0 * ip.inner_product(dst); + if (d < d_curr) { + d_curr = d; + v_curr = dst; + changed = true; + } + } + } + } + /* loc: 10 */ + /**/ + *v_curr_o = v_curr; + *d_curr_o = d_curr; + } + return 0; + } + /* loc: 5 */ + +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh new 
file mode 100644 index 000000000..1f12e76ba --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/greedy_walk_v4/loc.sh @@ -0,0 +1 @@ +cat kernel.loc.cpp | grep loc: | cut -d: -f2 | cut -d* -f1 | awk 'BEGIN{x=0}{x = x+$1}END{print x}' diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp index 8bb83077a..6099411c2 100644 --- a/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp +++ b/examples/sdh-eval-workloads/ipnsw/kernel/include/inner_product.hpp @@ -2,6 +2,8 @@ #include "bsg_striped_array.hpp" #include #include +#include +#include "sleep_until_valid.hpp" template __attribute__((noinline)) @@ -72,8 +74,8 @@ __attribute__((noinline)) FLOAT_T inner_product_v4(const FLOAT_T *__restrict a, bsg_attr_remote const FLOAT_T *__restrict b) { - register FLOAT_T r[UNROLL]; - for (int i = __bsg_id * BSIZE; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { + register FLOAT_T r[UNROLL] = {0}; + for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { #pragma bsg_unroll(32) for (int j = 0; j < BSIZE; ++j) { #pragma bsg_unroll(32) @@ -82,8 +84,236 @@ FLOAT_T inner_product_v4(const FLOAT_T *__restrict a, } } } - int rs = 0.0; + FLOAT_T rs = 0.0; for (int i = 0; i < UNROLL; ++i) rs += r[i]; return rs; } + +template +__attribute__((noinline)) +FLOAT_T inner_product_parallel_v1(const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0.0}; + + for (int i = __bsg_id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_X * TG_Y) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + + return rs; +} + + +template +__attribute__((noinline)) +FLOAT_T inner_product_v4_serial(const FLOAT_T 
*__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0}; + for (int i = 0; i < VSIZE; i += UNROLL * BSIZE) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + return rs; +} + + +template +FLOAT_T inner_product_parallel_v2( + int id, + const FLOAT_T *__restrict a, + bsg_attr_remote const FLOAT_T *__restrict b) +{ + register FLOAT_T r[UNROLL] = {0.0}; + + for (int i = id * BSIZE * UNROLL; i < VSIZE; i += UNROLL * BSIZE * TG_N) { +#pragma bsg_unroll(32) + for (int j = 0; j < BSIZE; ++j) { +#pragma bsg_unroll(32) + for (int k =0 ; k < UNROLL; ++k) { + r[k] = fmaf(a[i+j+k*BSIZE], b[i+j+k*BSIZE], r[k]); + } + } + } + FLOAT_T rs = 0.0; + for (int i = 0; i < UNROLL; ++i) + rs += r[i]; + + return rs; +} + +template +class InnerProductParallel_v1 { +public: + static constexpr std::size_t VSIZE = 100; + static constexpr std::size_t TG_N = TG_X * TG_Y; + static constexpr int SYNC_DONE = -2; + static constexpr int SYNC_INV = -1; + + InnerProductParallel_v1(bsg_attr_remote const float *t1, const float *t2) { + _inf = INFINITY; + for (int i = 0; i < TG_N; ++i) + _partial[i] = _inf; + + for (int x = 0; x < TG_X; ++x) + for (int y = 0; y < TG_Y; ++y) + _t1_idx_group[bsg_x_y_to_id(x,y)] + = bsg_tile_group_remote_pointer(x,y,&_t1_idx); + + _t1 = t1; + _t2 = t2; + _t1_idx = SYNC_INV; + } + + void init() { + if (__bsg_id == 0) { + return; + } + + float p = 0.0; + int t1_idx; + float *partial_result = bsg_tile_group_remote_pointer(0, 0, &_partial[__bsg_id]); + + while (true) { + t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); + if (t1_idx == SYNC_DONE) + break; + + p = inner_product_parallel_v1(_t2, &_t1[t1_idx * VSIZE]); + *partial_result = p; + } + } + + float inner_product(int idx) { + if (__bsg_id != 0) + return 0.0; + + for (int tile = 
0; tile < TG_X*TG_Y; ++tile) + *_t1_idx_group[tile] = idx; + + _partial[__bsg_id] = inner_product_parallel_v1(_t2, &_t1[idx * VSIZE]); + + float r = 0.0; + for (int tile = 0; tile +class InnerProductParallel_Y { +public: + static constexpr std::size_t VSIZE = 100; + static constexpr int SYNC_DONE = -2; + static constexpr int SYNC_INV = -1; + + InnerProductParallel_Y(bsg_attr_remote const float *t1, const float *t2) { + _inf = INFINITY; + for (int i = 0; i < TG_Y; ++i) + _partial[i] = _inf; + + for (int y = 0; y < TG_Y; ++y) + _t1_idx_group[y] = bsg_tile_group_remote_pointer(__bsg_x, y, &_t1_idx); + + _t1 = t1; + _t2 = t2; + _t1_idx = SYNC_INV; + } + + void init() { + if (__bsg_y == 0) { + return; + } + + float p = 0.0; + int t1_idx; + float *partial_result = bsg_tile_group_remote_pointer(__bsg_x, 0, &_partial[__bsg_y]); + + while (true) { + t1_idx = sleep_until_valid(&_t1_idx, SYNC_INV); + if (t1_idx == SYNC_DONE) + break; + + p = inner_product_parallel_v2(__bsg_y, _t2, &_t1[t1_idx * VSIZE]); + *partial_result = p; + } + } + + float inner_product(int idx) { + if (__bsg_y != 0) + return 0.0; + + for (int tile = 0; tile < TG_Y; ++tile) + *_t1_idx_group[tile] = idx; + + _partial[__bsg_y] = inner_product_parallel_v2(__bsg_y, _t2, &_t1[idx * VSIZE]); + + float r = 0.0; + for (int tile = 0; tile +template +static inline T sleep_on_update(volatile T *ptr) +{ + T r; + asm volatile ("lr.w.aq %[r], %[ptr]" : + [r] "=r" (r) : + [ptr] "m" (*ptr) + ); + return r; +} + +template +static inline T sleep_until_valid(volatile T *ptr, T not_valid) +{ + T r; + + asm volatile ("lr.w %[r], %[ptr]" : + [r] "=r" (r) : + [ptr] "m" (*ptr)); + + while (r == not_valid) { + r = sleep_on_update(ptr); + } + *ptr = not_valid; + return r; +} diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp new file mode 100644 index 000000000..9fe605f3a --- /dev/null +++ 
b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v1/kernel.cpp @@ -0,0 +1,180 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +//#define DEBUG_SLAVE +//#define DEBUG_MASTER + +using barrier = bsg_barrier; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_parallel_v1(x,y) + +#define SYNC_DONE -1 + + __attribute__((noinline)) + int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all, + barrier *group_barrier, + std::atomic *kp, + std::atomic *rp) + { + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + // pre-compute addresses on remote tiles + std::atomic *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + std::atomic *rp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + + for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) { + for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) { + 
kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp); + rp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, rp); + } + } + + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + + // set k on all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + kp_group[tile]->store(k, std::memory_order_relaxed); + + // do inner product + group_barrier->sync(); // signal ready + float r_local = iproduct(query, &database[k * VSIZE]); +#ifdef DEBUG_MASTER + bsg_print_float(r_local); +#endif + rp_group[__bsg_id]->store(r_local, std::memory_order_relaxed); + group_barrier->sync(); // signal done + + // read r from all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) { + float r_remote = rp_group[tile]->load(std::memory_order_relaxed); +#ifdef DEBUG_MASTER + bsg_print_float(r_remote); +#endif + r += r_remote; + } + } + } + + return (int)r; + } + + __attribute__((noinline)) + void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + barrier *group_barrier, + std::atomic *kp, + std::atomic *rp) + { + float r = 0.0; + int k; + + while (true) { + // load next + group_barrier->sync(); // signal ready + k = kp->load(std::memory_order_relaxed); + if (k == SYNC_DONE) + break; + + // do inner product + r = iproduct(query, &database[k * VSIZE]); + rp->store(r, std::memory_order_relaxed); +#ifdef DEBUG_SLAVE + bsg_print_float(r); +#endif + group_barrier->sync(); // signal done + } + } + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + static barrier group_barrier; + static std::atomic k; + static std::atomic r; + float rr; + + float 
q[VSIZE]; + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + if (__bsg_id == 0) { + // enter master loop + rr = inner_product_ubmk_master(database, q, N, visit_remote_all, + &group_barrier, &k, &r); + } else { + // enter slave loop + inner_product_ubmk_slave(database, q, &group_barrier, &k, &r); + } + bsg_cuda_print_stat_end(0); + + return (int)(rr); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp new file mode 100644 index 000000000..df8be2dae --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v2/kernel.cpp @@ -0,0 +1,154 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. 
+#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" +#include "sleep_until_valid.hpp" + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_MASTER +//#define DEBUG_SLAVE +#define iproduct(x,y) \ + inner_product_parallel_v1(x,y) + + #define SYNC_INV -2 + #define SYNC_DONE -1 + + __attribute__((noinline)) + int inner_product_ubmk_master(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all, + int *kp, + float *rp) + { + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + // pre-compute addresses on remote tiles + int *kp_group[BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM]; + + for (int tile_x = 0; tile_x < BSG_TILE_GROUP_X_DIM; ++tile_x) { + for (int tile_y = 0; tile_y < BSG_TILE_GROUP_Y_DIM; ++tile_y) { + kp_group[bsg_x_y_to_id(tile_x, tile_y)] = bsg_tile_group_remote_pointer(tile_x, tile_y, kp); + } + } + + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + + // set k on all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + *kp_group[tile] = k; + + float r_local = iproduct(query, &database[k * VSIZE]); +#ifdef DEBUG_MASTER + bsg_print_float(r_local); +#endif + rp[__bsg_id] = r_local; + + // read r from all tiles + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) { + float r_remote = 
sleep_until_valid(&rp[tile], INFINITY); +#ifdef DEBUG_MASTER + bsg_print_float(r_remote); +#endif + r += r_remote; + } + } + } + + for (int tile = 0; tile < BSG_TILE_GROUP_X_DIM*BSG_TILE_GROUP_Y_DIM; ++tile) + *kp_group[tile] = SYNC_DONE; + + return (int)r; + } + + __attribute__((noinline)) + void inner_product_ubmk_slave(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int *kp, + float *rp) + { + float r = 0.0; + int k; + + while (true) { + // load next + k = sleep_until_valid(kp, SYNC_INV); + if (k == SYNC_DONE) + break; + + r = iproduct(query, &database[k * VSIZE]); + *rp = r; +#ifdef DEBUG_SLAVE + bsg_print_float(r); +#endif + } + } + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + static int k = SYNC_INV; + static float r [BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM] = {INFINITY}; + float rr; + + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + if (__bsg_id == 0) { + // enter master loop + rr = inner_product_ubmk_master(database, q, N, visit_remote_all, &k, r); + } else { + // enter slave loop + inner_product_ubmk_slave(database, q, &k, bsg_tile_group_remote_pointer(0,0, &r[__bsg_id])); + } + bsg_cuda_print_stat_end(0); + + return (int)(rr); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp new file mode 100644 index 000000000..be92dfcc9 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk-parallel_v3/kernel.cpp @@ -0,0 +1,80 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and bsg_tile_group_barrier.h are +// included. 
bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 2 +#define BSG_TILE_GROUP_Y_DIM 2 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +//#include +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" +#include "sleep_until_valid.hpp" + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +using InnerProduct = InnerProductParallel_v1; +using barrier = bsg_barrier; + +#ifdef __cplusplus +extern "C" { +#endif + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + float q[VSIZE]; + memcpy(q, query, sizeof(q)); + barrier b; + + bsg_cuda_print_stat_start(0); + + InnerProduct ip(database, q); + ip.init(); + float r = 0.0; + int visit[VISIT_BUFSIZE]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + // read k + int k = visit[j]; + float rp = ip.inner_product(k); + r += rp; + } + } + + ip.exit(); + bsg_cuda_print_stat_end(0); + b.sync(); + return (int)(r); + } +#ifdef __cplusplus +} +#endif diff --git a/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp new file mode 100644 index 000000000..9d9dcbc11 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/kernel/iproduct_ubmk_parallel/kernel.cpp @@ -0,0 +1,94 @@ +/* + * This kernel prints the Hello World message + */ + +// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined +// before bsg_manycore.h and 
bsg_tile_group_barrier.h are +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +#include +#include +#include +#include +#include +//#include +#include "inner_product.hpp" +#include "heap.hpp" +//#include "inner_product.h" + +/* We wrap all external-facing C++ kernels with `extern "C"` to + * prevent name mangling + */ + +//#define V 1000000 +#define VSIZE 100 +#define NG 4 +#define V_ENTRY 82026 + +#define EF 128 +#define N_RESULTS 10 + +#define VISIT_BUFSIZE 512 + +#define G_0 3 +#define G_1 2 +#define G_2 1 +#define G_3 0 + +struct graph { + const int *offsets; + const int *neighbors; + int V; + int E; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#define iproduct(x,y) \ + inner_product_v3(x,y) + + int inner_product_ubmk(bsg_attr_remote const float * __restrict database, + const float * __restrict query, + int N, + int *visit_remote_all) + { + float q[VSIZE]; + float r = 0; + int visit[VISIT_BUFSIZE]; + //int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id_x * __bsg_tile_group_id_y]; + int *visit_remote = &visit_remote_all[N * __bsg_tile_group_id]; + //int *visit_remote = &visit_remote_all[0]; + + bsg_print_int(-1 * __bsg_tile_group_id); + bsg_print_int(N); + bsg_print_hexadecimal(reinterpret_cast(database)); + bsg_print_hexadecimal(reinterpret_cast(query)); + bsg_print_hexadecimal(reinterpret_cast(visit_remote_all)); + + memcpy(q, query, sizeof(q)); + + bsg_cuda_print_stat_start(0); + for (int i = 0; i < N; i += VISIT_BUFSIZE) { + size_t sz = std::min(VISIT_BUFSIZE, (N-i)); + memcpy(visit, &visit_remote[i], sz*sizeof(int)); + + for (int j = 0; j < sz; ++j) { + int k = visit[j]; + //r += iproduct(q, &database[(i+j*3)*VSIZE]); + r += iproduct(q, &database[k*VSIZE]); + } + } + bsg_cuda_print_stat_end(0); + + return (int)(r); + } 
+#ifdef __cplusplus +} +#endif From 2d040fb880e4ba9351718c3b45424322cd46ded7 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 16:03:50 -0700 Subject: [PATCH 03/22] [ipnsw] Newer versions --- examples/sdh-eval-workloads/ipnsw/Makefile | 203 ++++++++++++++++++--- 1 file changed, 182 insertions(+), 21 deletions(-) diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index b4bfa09d7..a8350c2cd 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -44,24 +44,125 @@ BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm ################################################################################ # Define the range of versions ################################################################################ -# Kernel versions. See kernel/README.md for more information. Version names do -# not need to use v* and can be any string -VERSIONS := greedy_walk # inner product with ipc=0.3 (8x4) -VERSIONS += greedy_walk_v1 # inner product with ipc=0.43 (8x4) -VERSIONS += greedy_walk_v2 # inner product with FLOPS/cycle=0.2 (8x4) -VERSIONS += greedy_walk_v3 # inner product with FLOPS/cycle=0.26 (8x4) -VERSIONS += beam_search # very slow - uses a very dumb sparse set -VERSIONS += beam_search_v1 # dense set - inner product with ipc=0.3 (8x4) -VERSIONS += beam_search_v2 # dense set - inner product with ipc=0.43 (8x4) -VERSIONS += beam_search_v3 # + inner_product_v2 (flops/cycle=0.2039) (8x4) -VERSIONS += beam_search_v4 # + inner_product_v3 (flops/cycle=0.2663) (8x4) -VERSIONS += beam_search_v5 # + Bit vector for dense set + # inner product with ipc=0.3 (8x4) +VERSIONS := greedy_walk +greedy_walk-grp-x := 1 +greedy_walk-grp-y := 1 +# inner product with ipc=0.43 (8x4) +VERSIONS += greedy_walk_v1 +greedy_walk_v1-grp-x := 1 +greedy_walk_v1-grp-y := 1 +# inner product with FLOPS/cycle=0.2 (8x4) +VERSIONS += greedy_walk_v2 +greedy_walk_v2-grp-x := 1 
+greedy_walk_v2-grp-y := 1 +# inner product with FLOPS/cycle=0.26 (8x4) +VERSIONS += greedy_walk_v3 +greedy_walk_v3-grp-x := 1 +greedy_walk_v3-grp-y := 1 +# inner product v4-serial +VERSIONS += greedy_walk_v3-ipv4serial +greedy_walk_v3-ipv4serial-grp-x := 1 +greedy_walk_v3-ipv4serial-grp-y := 1 +# greedy_walk_v3 + ParallelInnerProduct_v1 +VERSIONS += greedy_walk_v4 +greedy_walk_v4-grp-x := 2 +greedy_walk_v4-grp-y := 2 + +# very slow - uses a very dumb sparse set +VERSIONS += beam_search +beam_search-grp-x := 1 +beam_search-grp-y := 1 +# dense set - inner product with ipc=0.3 (8x4) +VERSIONS += beam_search_v1 +beam_search_v1-grp-x := 1 +beam_search_v1-grp-y := 1 +# dense set - inner product with ipc=0.43 (8x4) +VERSIONS += beam_search_v2 +beam_search_v2-grp-x := 1 +beam_search_v2-grp-y := 1 +# + inner_product_v2 (flops/cycle=0.2039) (8x4) +VERSIONS += beam_search_v3 +beam_search_v3-grp-x := 1 +beam_search_v3-grp-y := 1 +# + inner_product_v3 (flops/cycle=0.2663) (8x4) +VERSIONS += beam_search_v4 +beam_search_v4-grp-x := 1 +beam_search_v4-grp-y := 1 +# + Bit vector for dense set +VERSIONS += beam_search_v5 +beam_search_v5-grp-x := 1 +beam_search_v5-grp-y := 1 +# + Bit vector for dense set + inner product v4 seria; +VERSIONS += beam_search_v5-ipv4serial +beam_search_v5-ipv4serial-grp-x := 1 +beam_search_v5-ipv4serial-grp-y := 1 +# beam_search_v5 + inner_product_parallel_v3 +VERSIONS += beam_search_v6 +beam_search_v6-grp-x := 2 +beam_search_v6-grp-y := 2 +# beam_search_v6 but with 1x2 tile group +VERSIONS += beam_search_v7 +beam_search_v7-grp-x := 1 +beam_search_v7-grp-y := 2 +# beam_search_v5 but edges of candidates traversed in parallel +VERSIONS += beam_search_v8 +beam_search_v8-grp-x := 4 +beam_search_v8-grp-y := 4 +# combination of beam_search_v8 + beam_search_v6 +VERSIONS += beam_search_v9 +beam_search_v9-grp-x := 4 +beam_search_v9-grp-y := 4 +# beam_search_v5 but edges of candidates traversed in parallel +VERSIONS += beam_search_v10 +beam_search_v10-grp-x := 8 
+beam_search_v10-grp-y := 4 + +# debugging this makefile VERSIONS += debug -VERSIONS += iproduct_ubmk # baseline - ipc = 0.3 -VERSIONS += iproduct_ubmk_v1 # using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867 -VERSIONS += iproduct_ubmk_v2 # + FMA, ipc = 0.386, flops/cycle = 0.2039 -VERSIONS += iproduct_ubmk_v3 # + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4) -VERSIONS += iproduct_ubmk_v4 # Slightly cleaner code than v3 - similar performance +debug-grp-x := 0 +debug-grp-y := 0 + + # baseline - ipc = 0.3 +VERSIONS += iproduct_ubmk +iproduct_ubmk-grp-x := 1 +iproduct_ubmk-grp-y := 1 +# using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867 +VERSIONS += iproduct_ubmk_v1 +iproduct_ubmk_v1-grp-x := 1 +iproduct_ubmk_v1-grp-y := 1 +# + FMA, ipc = 0.386, flops/cycle = 0.2039 +VERSIONS += iproduct_ubmk_v2 +iproduct_ubmk_v2-grp-x := 1 +iproduct_ubmk_v2-grp-y := 1 +# + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4) +VERSIONS += iproduct_ubmk_v3 +iproduct_ubmk_v3-grp-x := 1 +iproduct_ubmk_v3-grp-y := 1 +# Slightly cleaner code than v3 - similar performance +VERSIONS += iproduct_ubmk_v4 +iproduct_ubmk_v4-grp-x := 1 +iproduct_ubmk_v4-grp-y := 1 + #... +VERSIONS += iproduct_ubmk_parallel +iproduct_ubmk_parallel-grp-x := 1 +iproduct_ubmk_parallel-grp-y := 1 + #... +VERSIONS += iproduct_ubmk-parallel_v1 +iproduct_ubmk-parallel_v1-grp-x := 2 +iproduct_ubmk-parallel_v1-grp-y := 2 + #... +VERSIONS += iproduct_ubmk-parallel_v1.1 +iproduct_ubmk-parallel_v1.1-grp-x := 2 +iproduct_ubmk-parallel_v1.1-grp-y := 2 + #... +VERSIONS += iproduct_ubmk-parallel_v2 +iproduct_ubmk-parallel_v2-grp-x := 2 +iproduct_ubmk-parallel_v2-grp-y := 2 +#... 
same as v2 but with (1x4 tg) +VERSIONS += iproduct_ubmk-parallel_v3 +iproduct_ubmk-parallel_v3-grp-x := 2 +iproduct_ubmk-parallel_v3-grp-y := 2 _KERNEL_COMPILER = CLANG ################################################################################ @@ -112,6 +213,12 @@ C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1 C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2 C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3 +# set group x/y values +define VERSION-SET-ARGS +kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-x $($(1)-grp-x) +kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-y $($(1)-grp-y) +endef +$(foreach v,$(VERSIONS),$(eval $(call VERSION-SET-ARGS,$v))) ################################ # Inner Product U-Benchmarking # @@ -156,11 +263,61 @@ iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARG # Add to versions VERSIONS += $(IPRODUCT-UBMK-VERSIONS) +######################################### +# Parallel Inner Product U-Benchmarking # +######################################### +# number iproducts +N-IPRODUCTS := 1500 +GRID-X := 1 2 4 8 +GRID-Y := 1 2 4 + +IPRODUCT-PARALLEL-BASENAME := iproduct_ubmk-parallel_v1 + +define IPRODUCT-UBMK-PARALLEL-RULE +# creates run directory from template +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.cpp: kernel/$(IPRODUCT-PARALLEL-BASENAME)/kernel.cpp + mkdir -p $$(dir $$@) + cp $$< $$@ + +# adds arguments +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1) +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-x $(2) +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-y $(3) +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-x $($(IPRODUCT-PARALLEL-BASENAME)-grp-x) +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-y 
$($(IPRODUCT-PARALLEL-BASENAME)-grp-y) +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.riscv +kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) +# adds to list of iproduct u-bmk +IPRODUCT-UBMK-PARALLEL-VERSIONS += iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3) +endef + +# Expand rule for each inner product input +$(foreach gy,$(GRID-Y),$(foreach gx,$(GRID-X),$(foreach nip,$(N-IPRODUCTS), $(eval $(call IPRODUCT-UBMK-PARALLEL-RULE,$(nip),$(gx),$(gy)))))) + +.PHONY: create-iproduct-ubmk-parallel +.PHONY: purge-iproduct-ubmk-parallel +.PHONY: iproduct-ubmk-parallel-stats + +# create rule +create-iproduct-ubmk-parallel: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/kernel.cpp) + +# purge rule +purge-iproduct-ubmk-parallel: + rm -rf $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v) + +# collect stats for all +iproduct-ubmk-parallel-stats: create-iproduct-ubmk-parallel +iproduct-ubmk-parallel-stats: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) + +# Add to versions +VERSIONS += $(IPRODUCT-UBMK-PARALLEL-VERSIONS) + #################### # Greedy Walk Runs # #################### GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490 -GREEDY-WALK-BASENAME := greedy_walk_v3 +GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490 +GREEDY-WALK-BASENAME := greedy_walk_v4 define GREEDY-WALK-RULE # creates run directory from template kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp @@ -168,8 +325,8 @@ kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.c cp $$< $$@ # adds arguments -kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1) -kernel/greedy_walk-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv +kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: C_ARGS += --queries $(1) 
+kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) # adds to list of greedy walk versions @@ -201,7 +358,11 @@ VERSIONS += $(GREEDY-WALK-VERSIONS) # Beam Search Runs # #################### BEAM-SEARCH-QUERIES := 2 188 229 355 427 472 -BEAM-SEARCH-BASENAME := beam_search_v5 +BEAM-SEARCH-QUERIES += 25 74 112 140 148 178 +BEAM-SEARCH-QUERIES += 214 244 278 302 331 +BEAM-SEARCH-QUERIES += 396 420 452 489 511 + +BEAM-SEARCH-BASENAME := beam_search_v10 define BEAM-SEARCH-RULE # creates run directory from template From 020fdccb99cac330e88ffa75c8df707d6b216d47 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 16:11:43 -0700 Subject: [PATCH 04/22] [ipnsw] submodules --- .gitmodules | 6 ++++++ examples/sdh-eval-workloads/ipnsw/graph-tools | 1 + examples/sdh-eval-workloads/ipnsw/hammerblade-helpers | 1 + examples/sdh-eval-workloads/ipnsw/hb-prog-eval | 2 +- 4 files changed, 9 insertions(+), 1 deletion(-) create mode 160000 examples/sdh-eval-workloads/ipnsw/graph-tools create mode 160000 examples/sdh-eval-workloads/ipnsw/hammerblade-helpers diff --git a/.gitmodules b/.gitmodules index 5083eb4ea..099c758cb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ [submodule "n"] path = examples/sdh-eval-workloads/ipnsw/hb-prog-eval url = git@github.com:bespoke-silicon-group/hb-prog-eval +[submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"] + path = examples/sdh-eval-workloads/ipnsw/graph-tools + url = git@github.com:mrutt92/graph-tools +[submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"] + path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers + url = git@github.com:mrutt92/hammerblade-helpers diff --git a/examples/sdh-eval-workloads/ipnsw/graph-tools b/examples/sdh-eval-workloads/ipnsw/graph-tools new file mode 160000 index 000000000..a7304c67c --- /dev/null +++ 
b/examples/sdh-eval-workloads/ipnsw/graph-tools @@ -0,0 +1 @@ +Subproject commit a7304c67c34070877e57719fd183c4a5ee569904 diff --git a/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers new file mode 160000 index 000000000..9a26b6d0c --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/hammerblade-helpers @@ -0,0 +1 @@ +Subproject commit 9a26b6d0cbe04a9cc627cce7049a0ba97ca66621 diff --git a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval index 5915cc2c4..f113c0865 160000 --- a/examples/sdh-eval-workloads/ipnsw/hb-prog-eval +++ b/examples/sdh-eval-workloads/ipnsw/hb-prog-eval @@ -1 +1 @@ -Subproject commit 5915cc2c4bc6336102c452a4e7d0a7b06ccf9222 +Subproject commit f113c0865d2d9491551dab8f8b500445b75429bc From 663f2089bea13bd027a205eb0934c1f05dc9aad4 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 19:20:26 -0700 Subject: [PATCH 05/22] [ipnsw] Finally ported and working --- examples/sdh-eval-workloads/ipnsw/Makefile | 514 ++++---------------- examples/sdh-eval-workloads/ipnsw/ipnsw.cpp | 2 + 2 files changed, 109 insertions(+), 407 deletions(-) diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index a8350c2cd..c53ea4368 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -1,466 +1,166 @@ -# Copyright (c) 2019, University of Washington All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, this list -# of conditions and the following disclaimer. 
-# -# Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or -# other materials provided with the distribution. -# -# Neither the name of the copyright holder nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -################################################################################ -# Paths / Environment Configuration -################################################################################ -_REPO_ROOT ?= $(shell git rev-parse --show-toplevel) -CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) - --include $(_REPO_ROOT)/environment.mk - -################################################################################ -# Define BSG_MACHINE_PATH, the location of the Makefile.machine.include file -# that defines the machine to compile and simulate on. Using BSG_F1_DIR (which -# is set in environment.mk) uses the same machine as in bsg_replicant. 
-################################################################################ - -BSG_MACHINE_PATH=$(BSG_F1_DIR)/machines/pod_X1Y1_ruche_X16Y8_hbm +##################### +# Standard includes # +##################### +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) +include $(REPLICANT_PATH)/environment.mk + +####################################### +# Base clase run directory generation # +####################################### +# $1 = name +# $2 = version +# $3 = args +define run-dir +run/$1/kernel.cpp: kernel/$2/kernel.cpp + @mkdir -p $$(dir $$@) + @cp $$< $$@ + @echo "MAKING $$@" + +run/$1/Makefile: template.mk + @mkdir -p $$(dir $$@) + @cat $$< > $$@ + @echo "C_ARGS += $3" >> $$@ + @echo "MAKING $$@" + +.PHONY: generate-$1 build-$1 purge-$1 run-$1 + +generate-$1: run/$1/Makefile run/$1/kernel.cpp +purge-$1: + rm -rf run/$1 +build-$1: generate-$1 + $(MAKE) -C run/$1 main.riscv +run-$1: generate-$1 + $(MAKE) -C run/$1 main.exec.log +endef -################################################################################ -# Define the range of versions -################################################################################ +################################# +# Common command line arguments # +################################# +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/database_music100.bin +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/query_music100.bin +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_0 +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_1 +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_2 +C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_3 + +############### +# Greedy Walk # +############### +# greedy-walk version -> dimensions # inner product with ipc=0.3 (8x4) -VERSIONS := greedy_walk 
greedy_walk-grp-x := 1 greedy_walk-grp-y := 1 # inner product with ipc=0.43 (8x4) -VERSIONS += greedy_walk_v1 greedy_walk_v1-grp-x := 1 greedy_walk_v1-grp-y := 1 # inner product with FLOPS/cycle=0.2 (8x4) -VERSIONS += greedy_walk_v2 greedy_walk_v2-grp-x := 1 greedy_walk_v2-grp-y := 1 # inner product with FLOPS/cycle=0.26 (8x4) -VERSIONS += greedy_walk_v3 greedy_walk_v3-grp-x := 1 greedy_walk_v3-grp-y := 1 # inner product v4-serial -VERSIONS += greedy_walk_v3-ipv4serial greedy_walk_v3-ipv4serial-grp-x := 1 greedy_walk_v3-ipv4serial-grp-y := 1 # greedy_walk_v3 + ParallelInnerProduct_v1 -VERSIONS += greedy_walk_v4 greedy_walk_v4-grp-x := 2 greedy_walk_v4-grp-y := 2 +# $1 = version +# $2 = query +greedy-walk-name = $(1)_query$(2) +define greedy-walk +$(eval $(call run-dir,$(call greedy-walk-name,$1,$2),$1,\ +$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call greedy-walk-name,$1,$2)/kernel.riscv \ +$1 \ +$(C_ARGS) \ +--queries $(2) \ +--group-x $($(1)-grp-x) \ +--group-y $($(1)-grp-y) \ +)) +greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2) +greedy-walk-purge: purge-$(call greedy-walk-name,$1,$2) +greedy-walk-build: build-$(call greedy-walk-name,$1,$2) +greedy-walk-run: run-$(call greedy-walk-name,$1,$2) +endef +.PHONY: greedy-walk-generate +.PHONY: greedy-walk-purge +.PHONY: greedy-walk-build +.PHONY: greedy-walk-run + +############### +# Beam Search # +############### +# beam-search version -> dimensions + # very slow - uses a very dumb sparse set -VERSIONS += beam_search beam_search-grp-x := 1 beam_search-grp-y := 1 # dense set - inner product with ipc=0.3 (8x4) -VERSIONS += beam_search_v1 beam_search_v1-grp-x := 1 beam_search_v1-grp-y := 1 # dense set - inner product with ipc=0.43 (8x4) -VERSIONS += beam_search_v2 beam_search_v2-grp-x := 1 beam_search_v2-grp-y := 1 # + inner_product_v2 (flops/cycle=0.2039) (8x4) -VERSIONS += beam_search_v3 beam_search_v3-grp-x := 1 beam_search_v3-grp-y := 1 # + inner_product_v3 (flops/cycle=0.2663) (8x4) -VERSIONS += 
beam_search_v4 beam_search_v4-grp-x := 1 beam_search_v4-grp-y := 1 # + Bit vector for dense set -VERSIONS += beam_search_v5 beam_search_v5-grp-x := 1 beam_search_v5-grp-y := 1 # + Bit vector for dense set + inner product v4 seria; -VERSIONS += beam_search_v5-ipv4serial beam_search_v5-ipv4serial-grp-x := 1 beam_search_v5-ipv4serial-grp-y := 1 # beam_search_v5 + inner_product_parallel_v3 -VERSIONS += beam_search_v6 beam_search_v6-grp-x := 2 beam_search_v6-grp-y := 2 # beam_search_v6 but with 1x2 tile group -VERSIONS += beam_search_v7 beam_search_v7-grp-x := 1 beam_search_v7-grp-y := 2 # beam_search_v5 but edges of candidates traversed in parallel -VERSIONS += beam_search_v8 beam_search_v8-grp-x := 4 beam_search_v8-grp-y := 4 # combination of beam_search_v8 + beam_search_v6 -VERSIONS += beam_search_v9 beam_search_v9-grp-x := 4 beam_search_v9-grp-y := 4 # beam_search_v5 but edges of candidates traversed in parallel -VERSIONS += beam_search_v10 beam_search_v10-grp-x := 8 beam_search_v10-grp-y := 4 -# debugging this makefile -VERSIONS += debug -debug-grp-x := 0 -debug-grp-y := 0 - - # baseline - ipc = 0.3 -VERSIONS += iproduct_ubmk -iproduct_ubmk-grp-x := 1 -iproduct_ubmk-grp-y := 1 -# using clang and NBLs, ipc = 0.43, flops/cycle = 0.1867 -VERSIONS += iproduct_ubmk_v1 -iproduct_ubmk_v1-grp-x := 1 -iproduct_ubmk_v1-grp-y := 1 -# + FMA, ipc = 0.386, flops/cycle = 0.2039 -VERSIONS += iproduct_ubmk_v2 -iproduct_ubmk_v2-grp-x := 1 -iproduct_ubmk_v2-grp-y := 1 -# + explicit parallel fma (ipc=0.45,flops/cycle = 0.2663) (8x4) -VERSIONS += iproduct_ubmk_v3 -iproduct_ubmk_v3-grp-x := 1 -iproduct_ubmk_v3-grp-y := 1 -# Slightly cleaner code than v3 - similar performance -VERSIONS += iproduct_ubmk_v4 -iproduct_ubmk_v4-grp-x := 1 -iproduct_ubmk_v4-grp-y := 1 - #... -VERSIONS += iproduct_ubmk_parallel -iproduct_ubmk_parallel-grp-x := 1 -iproduct_ubmk_parallel-grp-y := 1 - #... 
-VERSIONS += iproduct_ubmk-parallel_v1 -iproduct_ubmk-parallel_v1-grp-x := 2 -iproduct_ubmk-parallel_v1-grp-y := 2 - #... -VERSIONS += iproduct_ubmk-parallel_v1.1 -iproduct_ubmk-parallel_v1.1-grp-x := 2 -iproduct_ubmk-parallel_v1.1-grp-y := 2 - #... -VERSIONS += iproduct_ubmk-parallel_v2 -iproduct_ubmk-parallel_v2-grp-x := 2 -iproduct_ubmk-parallel_v2-grp-y := 2 -#... same as v2 but with (1x4 tg) -VERSIONS += iproduct_ubmk-parallel_v3 -iproduct_ubmk-parallel_v3-grp-x := 2 -iproduct_ubmk-parallel_v3-grp-y := 2 - -_KERNEL_COMPILER = CLANG -################################################################################ -# Define any sources that should be used compiled during kernel compilation, -# including the source file with the kernel itself. kernel.riscv will -# be the name of the compiled RISC-V Binary for the Manycore -# -# Use KERNEL_*LIBRARIES list sources that should be compiled and linked with all -# kernel.cpp versions. However, if you have version-specific sources you must -# come up with your own solution. -# -# Use KERNEL_INCLUDES to specify the path to directories that contain headers. -################################################################################ - -# C Libraries -KERNEL_CLIBRARIES += -# C++ Libraries -KERNEL_CXXLIBRARIES += - -KERNEL_INCLUDES += -I$(CURRENT_PATH)/kernel/include - -# Define the default kernel.cpp file. If KERNEL_DEFAULT is not defined it will -# be set to kernel.cpp in the same directory as this Makefile. 
-DEFAULT_VERSION := greedy_walk_v3 -KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp -#KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.c - -################################################################################ -# Include the kernel build rules (This must be included after KERNEL_*LIBRARIES, -# KERNEL_DEFAULT, KERNEL_INCLUDES, etc) -################################################################################ - --include $(EXAMPLES_PATH)/examples/cuda/riscv.mk - -################################################################################ -# END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES -################################################################################ - -################################################################################ -# Include the Cosimulation host build rules (This must be included after -# HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc) -################################################################################ -HOST_TARGET = ipnsw -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/database_music100.bin -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/query_music100.bin -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_0 -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_1 -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_2 -C_ARGS += $(CURRENT_PATH)/hb-prog-eval/ipnsw/data/music.edges.level_3 - -# set group x/y values -define VERSION-SET-ARGS -kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-x $($(1)-grp-x) -kernel/$(1)/$(HOST_TARGET).log: ARGS += --group-y $($(1)-grp-y) -endef -$(foreach v,$(VERSIONS),$(eval $(call VERSION-SET-ARGS,$v))) - -################################ -# Inner Product U-Benchmarking # -################################ -# number iproducts -N-IPRODUCTS := 150 500 1000 1500 2000 3000 -IPRODUCT-BASENAME := iproduct_ubmk_v4 - -define IPRODUCT-UBMK-RULE -# creates run directory from template 
-kernel/iproduct_ubmk-$(1)/kernel.cpp: kernel/$(IPRODUCT-BASENAME)/kernel.cpp - mkdir -p $$(dir $$@) - cp $$< $$@ - -# adds arguments -kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1) -kernel/iproduct_ubmk-$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv -kernel/iproduct_ubmk-$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) - -# adds to list of iproduct u-bmk -IPRODUCT-UBMK-VERSIONS += iproduct_ubmk-$(1) +# $1 = version +# $2 = query +beam-search-name = $(1)_query$(2) +define beam-search +$(eval $(call run-dir,$(call beam-search-name,$1,$2),$1,\ +$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call beam-search-name,$1,$2)/kernel.riscv \ +$1 \ +$(C_ARGS) \ +--queries $(2) \ +--group-x $($(1)-grp-x) \ +--group-y $($(1)-grp-y) \ +)) +beam-search-generate: generate-$(call beam-search-name,$1,$2) +beam-search-purge: purge-$(call beam-search-name,$1,$2) +beam-search-build: build-$(call beam-search-name,$1,$2) +beam-search-run: run-$(call beam-search-name,$1,$2) endef - -# Expand rule for each inner product input -$(foreach nip,$(N-IPRODUCTS),$(eval $(call IPRODUCT-UBMK-RULE,$(nip)))) - -.PHONY: create-iproduct-ubmk -.PHONY: purge-iproduct-ubmk -.PHONY: iproduct-ubmk-stats - -# create rule -create-iproduct-ubmk: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/kernel.cpp) - -# purge rule -purge-iproduct-ubmk: - rm -rf $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v) - -# collect stats for all -iproduct-ubmk-stats: create-iproduct-ubmk -iproduct-ubmk-stats: $(foreach v,$(IPRODUCT-UBMK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) - -# Add to versions -VERSIONS += $(IPRODUCT-UBMK-VERSIONS) - -######################################### -# Parallel Inner Product U-Benchmarking # -######################################### -# number iproducts -N-IPRODUCTS := 1500 -GRID-X := 1 2 4 8 -GRID-Y := 1 2 4 - -IPRODUCT-PARALLEL-BASENAME := iproduct_ubmk-parallel_v1 - -define IPRODUCT-UBMK-PARALLEL-RULE -# creates run directory 
from template -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.cpp: kernel/$(IPRODUCT-PARALLEL-BASENAME)/kernel.cpp - mkdir -p $$(dir $$@) - cp $$< $$@ - -# adds arguments -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --num-iproducts $(1) -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-x $(2) -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --grid-y $(3) -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-x $($(IPRODUCT-PARALLEL-BASENAME)-grp-x) -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: C_ARGS += --group-y $($(IPRODUCT-PARALLEL-BASENAME)-grp-y) -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.riscv -kernel/iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) -# adds to list of iproduct u-bmk -IPRODUCT-UBMK-PARALLEL-VERSIONS += iproduct_ubmk-parallel-$(1)-gx$(2)-gy$(3) -endef - -# Expand rule for each inner product input -$(foreach gy,$(GRID-Y),$(foreach gx,$(GRID-X),$(foreach nip,$(N-IPRODUCTS), $(eval $(call IPRODUCT-UBMK-PARALLEL-RULE,$(nip),$(gx),$(gy)))))) - -.PHONY: create-iproduct-ubmk-parallel -.PHONY: purge-iproduct-ubmk-parallel -.PHONY: iproduct-ubmk-parallel-stats - -# create rule -create-iproduct-ubmk-parallel: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/kernel.cpp) - -# purge rule -purge-iproduct-ubmk-parallel: - rm -rf $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v) - -# collect stats for all -iproduct-ubmk-parallel-stats: create-iproduct-ubmk-parallel -iproduct-ubmk-parallel-stats: $(foreach v,$(IPRODUCT-UBMK-PARALLEL-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) - -# Add to versions -VERSIONS += $(IPRODUCT-UBMK-PARALLEL-VERSIONS) - -#################### -# Greedy Walk Runs # -#################### -GREEDY-WALK-QUERIES := 4 16 
229 276 461 470 490 -GREEDY-WALK-QUERIES := 4 16 229 276 461 470 490 -GREEDY-WALK-BASENAME := greedy_walk_v4 -define GREEDY-WALK-RULE -# creates run directory from template -kernel/greedy_walk-query$(1)/kernel.cpp: kernel/$(GREEDY-WALK-BASENAME)/kernel.cpp - mkdir -p $$(dir $$@) - cp $$< $$@ - -# adds arguments -kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: C_ARGS += --queries $(1) -kernel/greedy_walk-query$(1)/$(HOST_TARGET).exec.log: BSG_MANYCORE_KERNELS = kernel/iproduct_ubmk-$(1)/kernel.riscv -kernel/greedy_walk-query$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) - -# adds to list of greedy walk versions -GREEDY-WALK-VERSIONS += greedy_walk-query$(1) -endef - -# Expand rule for each query -$(foreach q,$(GREEDY-WALK-QUERIES),$(eval $(call GREEDY-WALK-RULE,$(q)))) - -.PHONY: create-greedy-walk -.PHONY: purge-greedy-walk -.PHONY: greedy-walk-stats - -# create rule -create-greedy-walk: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/kernel.cpp) - -# purge rule -purge-greedy-walk: - rm -rf $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v) - -# collect stats for all -greedy-walk-stats: create-greedy-walk -greedy-walk-stats: $(foreach v,$(GREEDY-WALK-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) - -# Add to versions -VERSIONS += $(GREEDY-WALK-VERSIONS) - -#################### -# Beam Search Runs # -#################### -BEAM-SEARCH-QUERIES := 2 188 229 355 427 472 -BEAM-SEARCH-QUERIES += 25 74 112 140 148 178 -BEAM-SEARCH-QUERIES += 214 244 278 302 331 -BEAM-SEARCH-QUERIES += 396 420 452 489 511 - -BEAM-SEARCH-BASENAME := beam_search_v10 - -define BEAM-SEARCH-RULE -# creates run directory from template -kernel/beam_search-query$(1)/kernel.cpp: kernel/$(BEAM-SEARCH-BASENAME)/kernel.cpp - mkdir -p $$(dir $$@) - cp $$< $$@ - -# adds arguments -kernel/beam_search-query$(1)/$(HOST_TARGET).log: C_ARGS += --queries $(1) -kernel/beam_search-query$(1)/$(HOST_TARGET).log: BSG_MANYCORE_KERNELS = kernel/beam_search-query$(1)/kernel.riscv 
-kernel/beam_search-query$$(1)/kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) - -# adds to list of greedy walk versions -BEAM-SEARCH-VERSIONS += beam_search-query$(1) -endef - - -# Expand rule for each query -$(foreach q,$(BEAM-SEARCH-QUERIES),$(eval $(call BEAM-SEARCH-RULE,$(q)))) - -.PHONY: create-beam-search -.PHONY: purge-beam-search -.PHONY: beam-search-stats - -# create rule -create-beam-search: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/kernel.cpp) - -# purge rule -purge-beam-search: - rm -rf $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v) - -# collect stats for all -beam-search-stats: create-beam-search -beam-search-stats: $(foreach v,$(BEAM-SEARCH-VERSIONS),kernel/$v/$(HOST_TARGET).exec.log) - -# Add to versions -VERSIONS += $(BEAM-SEARCH-VERSIONS) - -######################################## -# Continue including cosim build rules # -######################################## - -GRAPH-TOOLS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools -graphtools-dir := $(GRAPH-TOOLS) - -include $(GRAPH-TOOLS)/libgraphtools.mk - -HB-HELPERS := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers -hammerblade-helpers-dir := $(HB-HELPERS) -include $(HB-HELPERS)/libhammerblade-helpers-host.mk - -CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags) -CXXFLAGS += $(libgraphtools-interface-cxxflags) -CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw -CXXFLAGS += -DCOSIM - -LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags) -LDFLAGS += $(libgraphtools-interface-ldflags) - -GreedyWalkResults.o: $(libhammerblade-helpers-host-interface-headers) -GreedyWalkResults.o: $(libgraphtools-interface-headers) -GreedyWalkResults.o: $(libgraphtools-interface-libraries) -GreedyWalkResults.o: GreedyWalkResults.cpp -GreedyWalkResults.o: GreedyWalkResults.hpp - -ipnsw.o: $(libhammerblade-helpers-host-interface-headers) -ipnsw.o: $(libgraphtools-interface-headers) -ipnsw.o: $(libgraphtools-interface-libraries) -ipnsw.o: IO.hpp -ipnsw.o: IPNSWGraph.hpp -ipnsw.o: 
IPNSWRunner.hpp -ipnsw.o: IPNSWKernelRunner.hpp -ipnsw.o: GreedyWalkKernelRunner.hpp -ipnsw.o: BeamSearchKernelRunner.hpp -ipnsw.o: IProductUBmkKernelRunner.hpp -ipnsw.o: IPNSWResultReader.hpp -ipnsw.o: GreedyWalkResultReader.hpp -ipnsw.o: BeamSearchResultReader.hpp -ipnsw.o: GreedyWalkResults.hpp -ipnsw.o: IPNSWFactory.hpp -ipnsw.o: GreedyWalkFactory.hpp -ipnsw.o: BeamSearchFactory.hpp -ipnsw.o: IProductUBmkFactory.hpp -ipnsw.o: StringHelpers.hpp - -TEST_SOURCES = ipnsw.cpp GreedyWalkResults.cpp - --include $(EXAMPLES_PATH)/compilation.mk --include $(EXAMPLES_PATH)/link.mk --include $(EXAMPLES_PATH)/execution.mk - -################################################################################ -# Define the clean rules. clean calls the makefile-specific cleans, whereas -# users can add commands and dependencies to custom.clean. -################################################################################ -version.clean: - rm -rf kernel/*/*{.csv,.log,.rvo,.riscv,.vpd,.key,.png,.dis} - rm -rf kernel/*/{stats,pc_stats} - -custom.clean: version.clean +.PHONY: beam-search-generate +.PHONY: beam-search-purge +.PHONY: beam-search-build +.PHONY: beam-search-run + +############################################################# +# Define which queries we want to run and instantiate rules # +############################################################# +greedy-walk-queries := 4 16 229 276 461 470 490 +$(foreach q,$(greedy-walk-queries),$(eval $(call greedy-walk,greedy_walk_v4,$(q)))) + +beam-search-queries := 2 188 229 355 427 472 +beam-search-queries += 25 74 112 140 148 178 +beam-search-queries += 214 244 278 302 331 +beam-search-queries += 396 420 452 489 511 +$(foreach q,$(beam-search-queries),$(eval $(call beam-search,beam_search_v10,$(q)))) diff --git a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp index 8de8e073e..23f2b20d1 100644 --- a/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp +++ 
b/examples/sdh-eval-workloads/ipnsw/ipnsw.cpp @@ -28,6 +28,8 @@ int Main(int argc, char *argv[]) Parser args; args.parse(argc, argv); + std::cout << args.str() << std::endl; + std::unique_ptr runner; std::unique_ptr factory; From 79aad5fbc501efeef62a09fd2909880e3bec4df9 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 19:22:38 -0700 Subject: [PATCH 06/22] [ipnsw] adds a profile rule --- examples/sdh-eval-workloads/ipnsw/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index c53ea4368..6d2c4ba49 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -22,7 +22,7 @@ run/$1/Makefile: template.mk @echo "C_ARGS += $3" >> $$@ @echo "MAKING $$@" -.PHONY: generate-$1 build-$1 purge-$1 run-$1 +.PHONY: generate-$1 build-$1 purge-$1 run-$1 profile-$1 generate-$1: run/$1/Makefile run/$1/kernel.cpp purge-$1: @@ -31,6 +31,8 @@ build-$1: generate-$1 $(MAKE) -C run/$1 main.riscv run-$1: generate-$1 $(MAKE) -C run/$1 main.exec.log +profile-$1: generate-$1 + $(MAKE) -C run/$1 main.profile.log endef ################################# @@ -82,11 +84,13 @@ greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2) greedy-walk-purge: purge-$(call greedy-walk-name,$1,$2) greedy-walk-build: build-$(call greedy-walk-name,$1,$2) greedy-walk-run: run-$(call greedy-walk-name,$1,$2) +greedy-walk-profile: profile-$(call greedy-walk-name,$1,$2) endef .PHONY: greedy-walk-generate .PHONY: greedy-walk-purge .PHONY: greedy-walk-build .PHONY: greedy-walk-run +.PHONY: greedy-walk-profile ############### # Beam Search # @@ -146,11 +150,13 @@ beam-search-generate: generate-$(call beam-search-name,$1,$2) beam-search-purge: purge-$(call beam-search-name,$1,$2) beam-search-build: build-$(call beam-search-name,$1,$2) beam-search-run: run-$(call beam-search-name,$1,$2) +beam-search-profile: profile-$(call 
beam-search-name,$1,$2) endef .PHONY: beam-search-generate .PHONY: beam-search-purge .PHONY: beam-search-build .PHONY: beam-search-run +.PHONY: beam-search-profile ############################################################# # Define which queries we want to run and instantiate rules # From de5dec8dc29e989664b40dcd2be8714a508324d8 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Fri, 30 Apr 2021 19:22:54 -0700 Subject: [PATCH 07/22] [ipnsw] adds missing template makefile --- examples/sdh-eval-workloads/ipnsw/template.mk | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 examples/sdh-eval-workloads/ipnsw/template.mk diff --git a/examples/sdh-eval-workloads/ipnsw/template.mk b/examples/sdh-eval-workloads/ipnsw/template.mk new file mode 100644 index 000000000..13c1e5919 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/template.mk @@ -0,0 +1,72 @@ +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) +include $(REPLICANT_PATH)/environment.mk +include $(BSG_MACHINE_PATH)/Makefile.machine.include + +# kernel code +BSG_MANYCORE_KERNELS = kernel.riscv + +RISCV_CCPPFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/kernel/include +RISCV_CCPPFLAGS += -Dbsg_tiles_X=1 +RISCV_CCPPFLAGS += -Dbsg_tiles_Y=1 + +RISCV_TARGET_OBJECTS = kernel.rvo +kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) +RISCV_OPT_LEVEL = -O3 +include $(EXAMPLES_PATH)/cuda/riscv.mk +RISCV_LDFLAGS := $(filter-out -nostdlib,$(RISCV_LDFLAGS)) + +# host code +graphtools-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/graph-tools +hammerblade-helpers-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hammerblade-helpers + +include $(graphtools-dir)/libgraphtools.mk +include $(hammerblade-helpers-dir)/libhammerblade-helpers-host.mk + +# header files +TEST_HEADERS := $(libhammerblade-helpers-host-interface-headers) +TEST_HEADERS += $(libgraphtools-interface-headers) +TEST_HEADERS += GreedyWalkResults.hpp +TEST_HEADERS += IO.hpp +TEST_HEADERS += IPNSWGraph.hpp +TEST_HEADERS += 
IPNSWRunner.hpp +TEST_HEADERS += IPNSWKernelRunner.hpp +TEST_HEADERS += GreedyWalkKernelRunner.hpp +TEST_HEADERS += BeamSearchKernelRunner.hpp +TEST_HEADERS += IProductUBmkKernelRunner.hpp +TEST_HEADERS += IPNSWResultReader.hpp +TEST_HEADERS += GreedyWalkResultReader.hpp +TEST_HEADERS += BeamSearchResultReader.hpp +TEST_HEADERS += GreedyWalkResults.hpp +TEST_HEADERS += IPNSWFactory.hpp +TEST_HEADERS += GreedyWalkFactory.hpp +TEST_HEADERS += BeamSearchFactory.hpp +TEST_HEADERS += IProductUBmkFactory.hpp +TEST_HEADERS += StringHelpers.hpp + +# source files +TEST_SOURCES := GreedyWalkResults.cpp +TEST_SOURCES += ipnsw.cpp + +# cxxflags +CXXFLAGS += $(libgraphtools-interface-cxxflags) +CXXFLAGS += $(libhammerblade-helpers-host-interface-cxxflags) +CXXFLAGS += -I$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw +CXXFLAGS += -DCOSIM + +# ldflags +LDFLAGS += $(libgraphtools-interface-ldflags) +LDFLAGS += $(libhammerblade-helpers-host-interface-ldflags) + +vpath %.cpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw +vpath %.hpp $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw + +TEST_NAME = main + +include $(EXAMPLES_PATH)/compilation.mk +include $(EXAMPLES_PATH)/link.mk + +# mark dependencies +$(TEST_OBJECTS): $(libgraphtools-interface-libraries) +$(TEST_OBJECTS): $(TEST_HEADERS) + +include $(EXAMPLES_PATH)/execution.mk From 7b25a5944edaa439b3a098b89832f733d2586d9e Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Sun, 2 May 2021 09:24:33 -0700 Subject: [PATCH 08/22] [ipnsw] cleans up the directory a bit --- examples/sdh-eval-workloads/ipnsw/.gitignore | 1 + examples/sdh-eval-workloads/ipnsw/Makefile | 82 +++++++++----------- 2 files changed, 39 insertions(+), 44 deletions(-) create mode 100644 examples/sdh-eval-workloads/ipnsw/.gitignore diff --git a/examples/sdh-eval-workloads/ipnsw/.gitignore b/examples/sdh-eval-workloads/ipnsw/.gitignore new file mode 100644 index 000000000..737e26b00 --- /dev/null +++ b/examples/sdh-eval-workloads/ipnsw/.gitignore @@ -0,0 +1 @@ +run/ \ No newline at 
end of file diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index 6d2c4ba49..d643c9e9f 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -4,6 +4,8 @@ REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) include $(REPLICANT_PATH)/environment.mk +all: + ####################################### # Base clase run directory generation # ####################################### @@ -25,14 +27,24 @@ run/$1/Makefile: template.mk .PHONY: generate-$1 build-$1 purge-$1 run-$1 profile-$1 generate-$1: run/$1/Makefile run/$1/kernel.cpp + purge-$1: rm -rf run/$1 + build-$1: generate-$1 - $(MAKE) -C run/$1 main.riscv -run-$1: generate-$1 - $(MAKE) -C run/$1 main.exec.log + +$(MAKE) -C run/$1 main.riscv + +exec-$1: generate-$1 + +$(MAKE) -C run/$1 main.exec.log + profile-$1: generate-$1 - $(MAKE) -C run/$1 main.profile.log + +$(MAKE) -C run/$1 main.profile.log + +debug-$1: generate-$1 + +$(MAKE) -C run/$1 main.debug.log + +saifgen-$1: generate-$1 + +$(MAKE) -C run/$1 main.saifgen.log endef ################################# @@ -68,30 +80,6 @@ greedy_walk_v3-ipv4serial-grp-y := 1 greedy_walk_v4-grp-x := 2 greedy_walk_v4-grp-y := 2 -# $1 = version -# $2 = query -greedy-walk-name = $(1)_query$(2) -define greedy-walk -$(eval $(call run-dir,$(call greedy-walk-name,$1,$2),$1,\ -$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call greedy-walk-name,$1,$2)/kernel.riscv \ -$1 \ -$(C_ARGS) \ ---queries $(2) \ ---group-x $($(1)-grp-x) \ ---group-y $($(1)-grp-y) \ -)) -greedy-walk-generate: generate-$(call greedy-walk-name,$1,$2) -greedy-walk-purge: purge-$(call greedy-walk-name,$1,$2) -greedy-walk-build: build-$(call greedy-walk-name,$1,$2) -greedy-walk-run: run-$(call greedy-walk-name,$1,$2) -greedy-walk-profile: profile-$(call greedy-walk-name,$1,$2) -endef -.PHONY: greedy-walk-generate -.PHONY: greedy-walk-purge -.PHONY: greedy-walk-build -.PHONY: greedy-walk-run -.PHONY:
greedy-walk-profile - ############### # Beam Search # ############### @@ -136,37 +124,43 @@ beam_search_v10-grp-y := 4 # $1 = version # $2 = query -beam-search-name = $(1)_query$(2) -define beam-search -$(eval $(call run-dir,$(call beam-search-name,$1,$2),$1,\ -$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call beam-search-name,$1,$2)/kernel.riscv \ +run-name = $(1)_query$(2) +define run +$(eval $(call run-dir,$(call run-name,$1,$2),$1,\ +$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/run/$(call run-name,$1,$2)/kernel.riscv \ $1 \ $(C_ARGS) \ --queries $(2) \ --group-x $($(1)-grp-x) \ --group-y $($(1)-grp-y) \ )) -beam-search-generate: generate-$(call beam-search-name,$1,$2) -beam-search-purge: purge-$(call beam-search-name,$1,$2) -beam-search-build: build-$(call beam-search-name,$1,$2) -beam-search-run: run-$(call beam-search-name,$1,$2) -beam-search-profile: profile-$(call beam-search-name,$1,$2) +generate: generate-$(call run-name,$1,$2) +purge: purge-$(call run-name,$1,$2) +build: build-$(call run-name,$1,$2) +exec: exec-$(call run-name,$1,$2) +profile: profile-$(call run-name,$1,$2) +debug: debug-$(call run-name,$1,$2) +saifgen: saifgen-$(call run-name,$1,$2) endef -.PHONY: beam-search-generate -.PHONY: beam-search-purge -.PHONY: beam-search-build -.PHONY: beam-search-run -.PHONY: beam-search-profile +.PHONY: generate +.PHONY: purge +.PHONY: build +.PHONY: exec +.PHONY: profile +.PHONY: debug +.PHONY: saifgen ############################################################# # Define which queries we want to run and instantiate rules # ############################################################# greedy-walk-queries := 4 16 229 276 461 470 490 -$(foreach q,$(greedy-walk-queries),$(eval $(call greedy-walk,greedy_walk_v4,$(q)))) +$(foreach q,$(greedy-walk-queries),$(eval $(call run,greedy_walk_v4,$(q)))) beam-search-queries := 2 188 229 355 427 472 beam-search-queries += 25 74 112 140 148 178 beam-search-queries += 214 244 278 302 331 beam-search-queries += 396 420 452 
489 511 -$(foreach q,$(beam-search-queries),$(eval $(call beam-search,beam_search_v10,$(q)))) +$(foreach q,$(beam-search-queries),$(eval $(call run,beam_search_v10,$(q)))) +.PHONY: all +all: exec From a36d3b03acc507dbd613a348b4196683cb741d14 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 22 Mar 2021 10:01:46 -0700 Subject: [PATCH 09/22] starting a graphit test dir, adding test_vec_add_parallel as dummy test for now --- examples/Makefile | 2 +- examples/graphit/Makefile | 61 +++++ examples/graphit/riscv.mk | 257 ++++++++++++++++++ .../graphit/test_vec_add_parallel/Makefile | 142 ++++++++++ .../graphit/test_vec_add_parallel/kernel.cpp | 20 ++ examples/graphit/test_vec_add_parallel/main.c | 196 +++++++++++++ 6 files changed, 677 insertions(+), 1 deletion(-) create mode 100644 examples/graphit/Makefile create mode 100644 examples/graphit/riscv.mk create mode 100644 examples/graphit/test_vec_add_parallel/Makefile create mode 100644 examples/graphit/test_vec_add_parallel/kernel.cpp create mode 100644 examples/graphit/test_vec_add_parallel/main.c diff --git a/examples/Makefile b/examples/Makefile index 1bc3055e8..6fda8a5ef 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -45,7 +45,7 @@ include $(REPLICANT_PATH)/environment.mk include $(EXAMPLES_PATH)/link.mk # Supported example suites -TARGETS = library spmd cuda python +TARGETS = library spmd cuda python graphit # Define the tests that get run TESTS += test_loader diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile new file mode 100644 index 000000000..1ac9533b9 --- /dev/null +++ b/examples/graphit/Makefile @@ -0,0 +1,61 @@ +# Copyright (c) 2019, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples Run `make help` +# to see the available targets for the selected platform. 
+ +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRARIES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +# CL_DIR: Path to the directory of this AWS F1 Project +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk + +# Defines REGRESSION_PREBUILD +include $(EXAMPLES_PATH)/link.mk + +# Define the tests that get run +TESTS += test_vec_add_parallel + +regression: $(TESTS) + @echo "GRAPHIT REGRESSION PASSED" + +$(TESTS): $(REGRESSION_PREBUILD) + $(MAKE) -C $@ regression + +clean: $(TESTS:=.clean) + +%.clean: + $(MAKE) -C $(@:.clean=) clean + +.PHONY: clean regression $(TESTS) %.clean diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk new file mode 100644 index 000000000..1266a5e7f --- /dev/null +++ b/examples/graphit/riscv.mk @@ -0,0 +1,257 @@ +# Copyright (c) 2019, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# TODO: Makefile comment +ORANGE=\033[0;33m +RED=\033[0;31m +NC=\033[0m + +################################################################################ +# Paths +################################################################################ +_REPO_ROOT ?= $(shell git rev-parse --show-toplevel) +-include $(_REPO_ROOT)/environment.mk + +BSG_MANYCORE_SPMD_PATH = $(BSG_MANYCORE_DIR)/software/spmd/ +BSG_MANYCORE_CUDALITE_PATH = $(BSG_MANYCORE_SPMD_PATH)/bsg_cuda_lite_runtime/ +BSG_MANYCORE_CUDALITE_MAIN_PATH = $(BSG_MANYCORE_CUDALITE_PATH)/main + +BSG_MANYCORE_LIB_PATH = $(BSG_MANYCORE_DIR)/software/bsg_manycore_lib +BSG_MANYCORE_COMMON_PATH = $(BSG_MANYCORE_SPMD_PATH)/common/ + +RISCV_TOOLS_PATH := $(BSG_MANYCORE_DIR)/software/riscv-tools/ +RISCV_GNU_PATH := $(RISCV_TOOLS_PATH)/riscv-install +RISCV_LLVM_PATH := $(RISCV_TOOLS_PATH)/llvm/llvm-install + +################################################################################ +# Include RISC-V Tool Configuration +################################################################################ + +RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py + +# These flags are not supported by clang +RISCV_GNU_FLAGS = 
-mno-fdiv -frerun-cse-after-loop -fweb -frename-registers + +RISCV_GCC ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS) +RISCV_GXX ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS) +RISCV_ELF2HEX ?= LD_LIBRARY_PATH=$(RISCV_GNU_PATH)/lib $(RISCV_GNU_PATH)/bin/elf2hex +RISCV_OBJCOPY ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objcopy +RISCV_AR ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-ar +RISCV_OBJDUMP ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objdump +RISCV_LINK ?= $(RISCV_GCC) -t -T $(LINK_SCRIPT) $(RISCV_LDFLAGS) +RISCV_LD ?= $(RISCV_GCC) + +RISCV_CLANG_ABI = ilp32f +RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) +RISCV_CLANG_CXXFLAGS += --sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs +RISCV_CLANG_CXXFLAGS += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0 +RISCV_CLANG_CXXFLAGS += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs + +RISCV_CLANG ?= $(RISCV_LLVM_PATH)/bin/clang $(RISCV_CLANG_CFLAGS) $(RISCV_CLANG_CCPPFLAGS) +RISCV_CLANGXX ?= $(RISCV_LLVM_PATH)/bin/clang++ $(RISCV_CLANG_CXXFLAGS) $(RISCV_CLANG_CCPPFLAGS) +RISCV_LLVM_OPT ?= $(RISCV_LLVM_PATH)/bin/opt +RISCV_LLVM_LLC ?= $(RISCV_LLVM_PATH)/bin/llc +RISCV_LLVM_LIB ?= $(RISCV_LLVM_PATH)/lib + +# Set the default RISC-V Compilers. To override these globally set +# RISCV_CXX = $(RISCV_CLANGXX), etc. This can also be done on a +# per-object basis. For example, foo.rvo: RISCV_CXX=$(RISCV_CLANGXX) +RISCV_CXX ?= $(RISCV_GXX) +RISCV_CC ?= $(RISCV_GCC) + +################################################################################ +# C/C++ Compilation Flags +# +# All RISCV C/C++ compilation variables simply have RISCV_* prepended.
+################################################################################ +RISCV_OPT_LEVEL ?= -O2 +RISCV_ARCH_OP := rv32imaf + +# CCPPFLAGS are common between GCC and G++ +RISCV_CCPPFLAGS += $(RISCV_OPT_LEVEL) +RISCV_CCPPFLAGS += -march=$(RISCV_ARCH_OP) +RISCV_CCPPFLAGS += -g +RISCV_CCPPFLAGS += -static +RISCV_CCPPFLAGS += -ffast-math +RISCV_CCPPFLAGS += -fno-common +RISCV_CCPPFLAGS += -ffp-contract=off + +RISCV_CFLAGS += -std=gnu99 $(RISCV_CCPPFLAGS) +RISCV_CXXFLAGS += -std=c++11 $(RISCV_CCPPFLAGS) +RISCV_CXXFLAGS += -fno-threadsafe-statics + +RISCV_INCLUDES += -I$(BSG_MANYCORE_COMMON_PATH) +RISCV_INCLUDES += -I$(BSG_MANYCORE_DIR)/software/bsg_manycore_lib + +# TODO: Fail if bsg_tiles_X/Y are not set +RISCV_DEFINES += -Dbsg_global_X=$(BSG_MACHINE_GLOBAL_X) +RISCV_DEFINES += -Dbsg_global_Y=$(BSG_MACHINE_GLOBAL_Y) +RISCV_DEFINES += -Dbsg_group_size=$(BSG_MACHINE_POD_TILES) +RISCV_DEFINES += -Dbsg_pods_X=$(BSG_MACHINE_PODS_X) +RISCV_DEFINES += -Dbsg_pods_Y=$(BSG_MACHINE_PODS_Y) +RISCV_DEFINES += -DIO_X_INDEX=$(BSG_MACHINE_HOST_X_CORD) +RISCV_DEFINES += -DIO_Y_INDEX=$(BSG_MACHINE_HOST_Y_CORD) +RISCV_DEFINES += -DPREALLOCATE=0 +RISCV_DEFINES += -DHOST_DEBUG=0 + +# We build and name a machine-specific crt.rvo because it's REALLY +# difficult to figure out why your program/cosimulation is hanging +# when the wrong link script was used during linking +crt.rvo: $(BSG_MANYCORE_COMMON_PATH)/crt.S + $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.comp.log + +# We compile these locally so that we don't interfere with the files in +# $(BSG_MANYCORE_LIB_PATH). 
+# BSG Manycore Library Objects +LIBBSG_MANYCORE_OBJECTS += bsg_set_tile_x_y.rvo +LIBBSG_MANYCORE_OBJECTS += bsg_tile_config_vars.rvo +LIBBSG_MANYCORE_OBJECTS += bsg_printf.rvo + +$(LIBBSG_MANYCORE_OBJECTS) main.rvo: RISCV_CXX = $(RISCV_GCC) + +$(LIBBSG_MANYCORE_OBJECTS): %.rvo:$(BSG_MANYCORE_LIB_PATH)/%.c + $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ + +main.rvo: $(BSG_MANYCORE_CUDALITE_MAIN_PATH)/main.c + $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ + +%.rvo: %.c + $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log + +%.rvo: %.cpp + $(RISCV_CXX) $(RISCV_CXXFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log + +kernel.compile.clean: + rm -rf *.rvo *.a + +.PRECIOUS: %.rvo + +################################################################################ +# Linker Flow +################################################################################ + +# ELF File Parameters +# Default .data section location; LOCAL=>DMEM, SHARED=>DRAM. 
+BSG_ELF_DEFAULT_DATA_LOC ?= LOCAL + +BSG_ELF_OFF_CHIP_MEM := $(BSG_MACHINE_DRAM_INCLUDED) + +# Total addressable DRAM size (in 32-bit WORDS, and SIZE bytes) +BSG_ELF_DRAM_WORDS := $(shell expr $(BSG_MACHINE_DRAM_BANK_SIZE_WORDS) \* $(BSG_MACHINE_GLOBAL_X)) +BSG_ELF_DRAM_SIZE := $(shell expr $(BSG_ELF_DRAM_WORDS) \* 4) + +# Victim Cache Set Size (in 32-bit WORDS and SIZE bytes) +_BSG_ELF_VCACHE_SET_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_WAY) \* $(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS)) +BSG_ELF_VCACHE_SET_SIZE := $(shell expr $(_BSG_ELF_VCACHE_SET_WORDS) \* 4) + +# Victim Cache Column Size (in 32-bit WORDS and SIZE bytes) +_BSG_ELF_VCACHE_COLUMN_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_SET) \* $(_BSG_ELF_VCACHE_SET_WORDS)) +BSG_ELF_VCACHE_COLUMN_SIZE := $(shell expr $(_BSG_ELF_VCACHE_COLUMN_WORDS) \* 4) + +# Victim Cache Total Size (in 32-bit WORDS, and SIZE BYTES) +_BSG_ELF_VCACHE_MANYCORE_WORDS ?= $(shell expr $(BSG_MACHINE_GLOBAL_X) \* $(_BSG_ELF_VCACHE_COLUMN_WORDS)) +BSG_ELF_VCACHE_MANYCORE_SIZE := $(shell expr $(_BSG_ELF_VCACHE_MANYCORE_WORDS) \* 4) + +# Compute the ELF Stack Pointer Location. +ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) +# If the .data segment is in DMEM (LOCAL) then put it at the top of DMEM. (This is the typical case) +BSG_ELF_STACK_PTR ?= 0x00000ffc +else + # EVA Offset in DRAM + BSG_ELF_DRAM_EVA_OFFSET = 0x80000000 + + ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1) + # Otherwise, use the top of DRAM (if present), + _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_DRAM_SIZE)) + else + # Or the Victim Cache address space (if DRAM is disabled/not present). 
+ _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_VCACHE_MANYCORE_SIZE)) + endif +# Finally, Subtract 4 from the maximum memory space address +BSG_ELF_STACK_PTR = $(shell expr $(_BSG_ELF_DRAM_LIMIT) - 4) +endif + +# Linker script generation parameters +ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1) + ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) + LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR) + else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED) + LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR) + else + $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid) + endif + + LINK_GEN_OPTS += --imem_size=0x01000000 # 16MB +else ifeq ($(BSG_ELF_OFF_CHIP_MEM), 0) + ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) + LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) --sp=$(BSG_ELF_STACK_PTR) + else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED) + LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) --sp=$(BSG_ELF_STACK_PTR) + else + $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid) + endif + + LINK_GEN_OPTS += --imem_size=0x00008000 # 32KB +else + $(error Invalid BSG_ELF_OFF_CHIP_MEM = $(BSG_ELF_OFF_CHIP_MEM); Only 0 and 1 are valid) +endif + +RISCV_LINK_SCRIPT ?= bsg_link.ld +$(RISCV_LINK_SCRIPT): $(RISCV_LINK_GEN) + $(RISCV_LINK_GEN) $(LINK_GEN_OPTS) --out=$@ + +# Link commands and definitions + +RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_dram_size=$(BSG_ELF_DRAM_SIZE) +RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_vcache_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) +RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_stack_ptr=$(BSG_ELF_STACK_PTR) + +RISCV_LDFLAGS += -nostdlib +RISCV_LDFLAGS += -march=$(RISCV_ARCH_OP) +RISCV_LDFLAGS += -nostartfiles +RISCV_LDFLAGS += -ffast-math +RISCV_LDFLAGS += -lc +RISCV_LDFLAGS += -lm +RISCV_LDFLAGS += -lgcc + +#
TODO: temporary fix to solve this problem: https://stackoverflow.com/questions/56518056/risc-v-linker-throwing-sections-lma-overlap-error-despite-lmas-belonging-to-dif +RISCV_LDFLAGS += -Wl,--no-check-sections + +# This builds a .riscv binary for the current machine type and tile +# group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked +# in the final binary. +%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) + $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ + +kernel.link.clean: + rm -rf *.riscv $(RISCV_LINK_SCRIPT) + + +.PRECIOUS: %.riscv +.PHONY: kernel.link.clean kernel.compile.clean +clean: kernel.link.clean kernel.compile.clean + diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile new file mode 100644 index 000000000..74c5c5b7c --- /dev/null +++ b/examples/graphit/test_vec_add_parallel/Makefile @@ -0,0 +1,142 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples. Run `make help` +# to see the available targets for the selected platform. + +################################################################################ +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRARIES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +############################################################################### + +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk +SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd +CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime + +# TEST_NAME is the basename of the executable +TEST_NAME = main +# KERNEL_NAME is the name of the CUDA-Lite Kernel +KERNEL_NAME = vec_add_parallel + +############################################################################### +# Host code compilation flags and flow
+############################################################################### + +# TEST_SOURCES is a list of source files that need to be compiled +TEST_SOURCES = main.c + +DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE +CDEFINES += +CXXDEFINES += + +FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable +CFLAGS += -std=c99 $(FLAGS) +CXXFLAGS += -std=c++11 $(FLAGS) + +# compilation.mk defines rules for compilation of C/C++ +include $(EXAMPLES_PATH)/compilation.mk + +# Specify any header file dependencies +main.o: INCLUDES += -I$(EXAMPLES_PATH) +main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h + +############################################################################### +# Host code link flags and flow +############################################################################### + +LDFLAGS += + +# link.mk defines rules for linking of the final execution binary. +include $(EXAMPLES_PATH)/link.mk + +############################################################################### +# Device code compilation flow +############################################################################### + +# BSG_MANYCORE_KERNELS is a list of manycore executables that should +# be built before executing. 
+BSG_MANYCORE_KERNELS = kernel.riscv + +kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) +kernel.riscv: kernel.rvo + +# Tile Group Dimensions +TILE_GROUP_DIM_X = 2 +TILE_GROUP_DIM_Y = 2 +RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) + +include $(EXAMPLES_PATH)/cuda/riscv.mk + +############################################################################### +# Execution flow +# +# C_ARGS: Use this to pass arguments that you want to appear in argv +# For SPMD tests C arguments are: +# +# SIM_ARGS: Use this to pass arguments to the simulator +############################################################################### +C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) + +SIM_ARGS ?= + +# Include platform-specific execution rules +include $(EXAMPLES_PATH)/execution.mk + +############################################################################### +# Regression Flow +############################################################################### + +regression: main.exec.log + @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null + +############################################################################### +# Default rules, help, and clean +############################################################################### +.DEFAULT_GOAL := help +help: + @echo "Usage:" + @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" + @echo " $(TEST_NAME).profile: Build executable with profilers enabled" + @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" + @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" + @echo " clean: Remove all subdirectory-specific outputs" + + +.PHONY: clean + +clean: + + diff --git a/examples/graphit/test_vec_add_parallel/kernel.cpp b/examples/graphit/test_vec_add_parallel/kernel.cpp new file mode 100644 index 000000000..b2ea1ae88 --- /dev/null +++ b/examples/graphit/test_vec_add_parallel/kernel.cpp @@ -0,0 +1,20 @@ +//This kernel adds 2 
vectors + +#include +#include +#include + +bsg_barrier barrier; + +/* + * Element-wise vector add: C[i] = A[i] + B[i]. + * Each tile group covers the block_size_x elements that start at + * block_size_x * (__bsg_tile_group_id_y * __bsg_grid_dim_x + __bsg_tile_group_id_x). + * Within the group, a tile starts at offset __bsg_id and advances by + * bsg_tiles_X * bsg_tiles_Y, so the tiles interleave over the block. + * Indices at or beyond N are skipped. Every tile then waits at barrier.sync(). + * Always returns 0. + */ +extern "C" __attribute__ ((noinline)) +int kernel_vec_add_parallel(int *A, int *B, int *C, int N, int block_size_x) { + + int start_x = block_size_x * (__bsg_tile_group_id_y * __bsg_grid_dim_x + __bsg_tile_group_id_x); + for (int iter_x = __bsg_id; iter_x < block_size_x; iter_x += bsg_tiles_X * bsg_tiles_Y) { + int idx = start_x + iter_x; + /* Guard against a launch whose blocks extend past the N-element vectors. */ + if (idx < N) { + C[idx] = A[idx] + B[idx]; + } + } + + barrier.sync(); + + return 0; +} diff --git a/examples/graphit/test_vec_add_parallel/main.c b/examples/graphit/test_vec_add_parallel/main.c new file mode 100644 index 000000000..07c9bd209 --- /dev/null +++ b/examples/graphit/test_vec_add_parallel/main.c @@ -0,0 +1,196 @@ +// Copyright (c) 2019, University of Washington All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// Redistributions of source code must retain the above copyright notice, this list +// of conditions and the following disclaimer. +// +// Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// +// Neither the name of the copyright holder nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOC_NAME "default_allocator" + +/*! + * Runs the vector addition on a grid of 2x2 tile groups. A[N] + B[N] --> C[N] + * Grid dimensions are determined by how much of a load we want for each tile group (block_size_x) + * This test uses the software/spmd/bsg_cuda_lite_runtime/vec_add_parallel/ Manycore binary in the BSG Manycore bitbucket repository. +*/ + + +/* Host reference implementation: C[i] = A[i] + B[i] for every i in [0, N). */ +void host_vec_add (int *A, int *B, int *C, int N) { + for (int i = 0; i < N; i ++) { + C[i] = A[i] + B[i]; + } + return; +} + + +/* + * Host driver for the test. Takes the binary path and test name from argv, then for + * every pod: loads the program, copies random A and B (N = 1024) to the device, runs + * "kernel_vec_add_parallel" on N / block_size_x tile groups of 2x2 tiles, copies C + * back and compares it with host_vec_add. + * Returns HB_MC_SUCCESS when every pod matches and HB_MC_FAIL on the first pod with a + * mismatch. NOTE(review): BSG_CUDA_CALL presumably returns early with the API error + * code on failure - confirm against its definition. + */ +int kernel_vec_add_parallel (int argc, char **argv) { + int rc; + char *bin_path, *test_name; + struct arguments_path args = {NULL, NULL}; + + argp_parse (&argp_path, argc, argv, 0, 0, &args); + bin_path = args.path; + test_name = args.name; + + bsg_pr_test_info("Running the CUDA Vector Addition Kernel on a grid of 2x2 tile groups.\n\n"); + + /* Seed with the current time; the original passed the address of time() instead of calling it. */ + srand((unsigned) time(NULL)); + + /*********************/ + /* Initialize device */ + /*********************/ + hb_mc_device_t device; + BSG_CUDA_CALL(hb_mc_device_init(&device, test_name, 0)); + + hb_mc_pod_id_t pod; + hb_mc_device_foreach_pod_id(&device, pod) + { + /**********************************************************************/ + /* Define path to binary. */ + /* Initialize device, load binary and unfreeze tiles.
*/ + /**********************************************************************/ + bsg_pr_test_info("Loading program for %s onto pod %d\n", + test_name, pod); + + BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod)); + BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0)); + + /***************************************************************************************************************** + * Allocate memory on the device for A, B and C. + ******************************************************************************************************************/ + uint32_t N = 1024; + + eva_t A_device, B_device, C_device; + BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &A_device)); /* allocate A[N] on the device */ + BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &B_device)); /* allocate B[N] on the device */ + BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &C_device)); /* allocate C[N] on the device */ + + /***************************************************************************************************************** + * Allocate memory on the host for A & B and initialize with random values. + ******************************************************************************************************************/ + uint32_t A_host[N]; /* allocate A[N] on the host */ + uint32_t B_host[N]; /* allocate B[N] on the host */ + for (int i = 0; i < N; i++) { /* fill A with arbitrary data */ + A_host[i] = rand() & 0xFFFF; + B_host[i] = rand() & 0xFFFF; + } + + /***************************************************************************************************************** + * Copy A & B from host onto device DRAM.
+ ******************************************************************************************************************/ + void *dst = (void *) ((intptr_t) A_device); + void *src = (void *) &A_host[0]; + BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy A to the device */ + + dst = (void *) ((intptr_t) B_device); + src = (void *) &B_host[0]; + BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy B to the device */ + + /***************************************************************************************************************** + * Define block_size_x/y: amount of work for each tile group + * Define tg_dim_x/y: number of tiles in each tile group + * Calculate grid_dim_x/y: number of tile groups needed based on block_size_x/y + ******************************************************************************************************************/ + uint32_t block_size_x = 64; + hb_mc_dimension_t tg_dim = { .x = 2, .y = 2 }; + hb_mc_dimension_t grid_dim = { .x = N / block_size_x, .y = 1 }; + + /***************************************************************************************************************** + * Prepare list of input arguments for kernel.
+ ******************************************************************************************************************/ + int cuda_argv[5] = {A_device, B_device, C_device, N, block_size_x}; + + /***************************************************************************************************************** + * Enqueue grid of tile groups, pass in grid and tile group dimensions, kernel name, number and list of input arguments + ******************************************************************************************************************/ + BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_vec_add_parallel", 5, cuda_argv)); + + /***************************************************************************************************************** + * Launch and execute all tile groups on device and wait for all to finish. + ******************************************************************************************************************/ + BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device)); + + /***************************************************************************************************************** + * Copy result matrix back from device DRAM into host memory. + ******************************************************************************************************************/ + uint32_t C_host[N]; + src = (void *) ((intptr_t) C_device); + dst = (void *) &C_host[0]; + BSG_CUDA_CALL(hb_mc_device_memcpy (&device, (void *) dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_HOST)); /* copy C to the host */ + + /***************************************************************************************************************** + * Freeze the tiles and memory manager cleanup.
+ ******************************************************************************************************************/ + BSG_CUDA_CALL(hb_mc_device_program_finish(&device)); + + /***************************************************************************************************************** + * Calculate the expected result using host code and compare the results. + ******************************************************************************************************************/ + uint32_t C_expected[N]; + host_vec_add (A_host, B_host, C_expected, N); + + + int mismatch = 0; + for (int i = 0; i < N; i++) { + /* Compare against the host reference that is also printed as "Expected". */ + if (C_expected[i] != C_host[i]) { + bsg_pr_err(BSG_RED("Mismatch: ") "C[%d]: 0x%08" PRIx32 " + 0x%08" PRIx32 " = 0x%08" PRIx32 "\t Expected: 0x%08" PRIx32 "\n", + i , A_host[i], B_host[i], C_host[i], C_expected[i]); + mismatch = 1; + } + } + + if (mismatch) { + /* Release the device before reporting failure; the early return used to skip hb_mc_device_finish(). */ + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + return HB_MC_FAIL; + } + } + + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + + return HB_MC_SUCCESS; +} + +/* Entry point (named vcs_main when VCS is defined): runs the test, reports pass/fail and returns the test's status code. */ +#ifdef VCS +int vcs_main(int argc, char ** argv) +#else +int main(int argc, char ** argv) +#endif +{ + bsg_pr_test_info("test_vec_add_parallel Regression Test \n"); + int rc = kernel_vec_add_parallel(argc, argv); + bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); + return rc; +} + + From 7b091c95af49717dcedff8bb4d3a3423d75f6082 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 22 Mar 2021 14:52:25 -0700 Subject: [PATCH 10/22] broken pr-nibble test --- examples/graphit/Makefile | 1 + examples/graphit/test_pr_nibble/Makefile | 150 +++++++++++ examples/graphit/test_pr_nibble/kernel.cpp | 225 ++++++++++++++++ .../kernel/include/pr_nibble.hpp | 9 + examples/graphit/test_pr_nibble/main.cpp | 247 ++++++++++++++++++ examples/graphit/test_pr_nibble/pr.hpp | 25 ++ examples/graphit/test_pr_nibble/pr_host.hpp | 53 ++++ .../graphit/test_vec_add_parallel/Makefile | 2 +- 8 files changed, 711 insertions(+), 1 deletion(-) create mode 100644 examples/graphit/test_pr_nibble/Makefile create mode 100644
examples/graphit/test_pr_nibble/kernel.cpp create mode 100644 examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp create mode 100644 examples/graphit/test_pr_nibble/main.cpp create mode 100644 examples/graphit/test_pr_nibble/pr.hpp create mode 100644 examples/graphit/test_pr_nibble/pr_host.hpp diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile index 1ac9533b9..f8389272b 100644 --- a/examples/graphit/Makefile +++ b/examples/graphit/Makefile @@ -46,6 +46,7 @@ include $(EXAMPLES_PATH)/link.mk # Define the tests that get run TESTS += test_vec_add_parallel +TESTS += test_pr_nibble regression: $(TESTS) @echo "GRAPHIT REGRESSION PASSED" diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile new file mode 100644 index 000000000..d456fd96d --- /dev/null +++ b/examples/graphit/test_pr_nibble/Makefile @@ -0,0 +1,150 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples. Run `make help` +# to see the available targets for the selected platform. + +################################################################################ +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRARIES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +############################################################################### + +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk +SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd +CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime +GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new +CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx + +# TEST_NAME is the basename of the executable +TEST_NAME = main +# KERNEL_NAME is the name of the CUDA-Lite Kernel +KERNEL_NAME = pr_nibble + +############################################################################### +# Host code compilation flags and flow +############################################################################### + +# TEST_SOURCES is a list of source files that need to be
compiled +TEST_SOURCES = main.cpp + +DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE +CDEFINES += +CXXDEFINES += + +FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable +CFLAGS += -std=c99 $(FLAGS) +CXXFLAGS += -std=c++14 $(FLAGS) + +HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ + +# compilation.mk defines rules for compilation of C/C++ +include $(EXAMPLES_PATH)/compilation.mk + +# Specify any header file dependencies +main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ +main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h + +############################################################################### +# Host code link flags and flow +############################################################################### + +LDFLAGS += + +# link.mk defines rules for linking of the final execution binary. +include $(EXAMPLES_PATH)/link.mk + +############################################################################### +# Device code compilation flow +############################################################################### + +# BSG_MANYCORE_KERNELS is a list of manycore executables that should +# be built before executing. 
+BSG_MANYCORE_KERNELS = kernel.riscv + +kernel.rvo: RISCV_CXX = $(RISCV_GXX) +kernel.riscv: kernel.rvo + +# Tile Group Dimensions +TILE_GROUP_DIM_X = 16 +TILE_GROUP_DIM_Y = 8 +RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) + +RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ + +include $(EXAMPLES_PATH)/graphit/riscv.mk + +############################################################################### +# Execution flow +# +# C_ARGS: Use this to pass arguments that you want to appear in argv +# For SPMD tests C arguments are: +# +# SIM_ARGS: Use this to pass arguments to the simulator +############################################################################### +C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) + +SIM_ARGS ?= + +# Include platform-specific execution rules +include $(EXAMPLES_PATH)/execution.mk + +############################################################################### +# Regression Flow +############################################################################### + +regression: main.exec.log + @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null + +############################################################################### +# Default rules, help, and clean +############################################################################### +.DEFAULT_GOAL := help +help: + @echo "Usage:" + @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" + @echo " $(TEST_NAME).profile: Build executable with profilers enabled" + @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" + @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" + @echo " clean: Remove all subdirectory-specific outputs" + + +.PHONY: clean + +clean: + + diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp new file mode 
100644 index 000000000..d49112e73 --- /dev/null +++ b/examples/graphit/test_pr_nibble/kernel.cpp @@ -0,0 +1,225 @@ +//#define DEBUG +#include + +#ifdef DEBUG +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#else +#include +// #define BSG_TILE_GROUP_X_DIM 16 +// #define BSG_TILE_GROUP_Y_DIM 8 +#endif + +#include +bsg_barrier barrier; + +#include +#include + +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) +#endif + +/* Global pointers to the per-vertex working arrays; the pointer variables themselves are placed in the ".dram" section. */ +__attribute__((section(".dram"))) double * __restrict p; +__attribute__((section(".dram"))) double * __restrict old_rank; +__attribute__((section(".dram"))) double * __restrict new_rank; +__attribute__((section(".dram"))) int * __restrict out_degree; +__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; +//__attribute__((section(".dram"))) double alpha = 0.15; +//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; + +/* Pull traversal. For each destination d in the [start, end) range produced by local_range(V, ...) (presumably the calling tile's share of the V vertices - confirm), walks d's in-neighbor slice in_neighbors[in_indices[d] .. in_indices[d + 1]) and calls apply_func(src, d) for every neighbor src whose from_vertexset entry is non-zero. E and block_size_x are not used. Always returns 0. */ +template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + int * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s]]) { + //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); + apply_func (neighbors[s] , d); + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +/* Push traversal. For each source s in the local_range(V, ...) range whose from_vertexset entry is non-zero, calls apply_func(s, dst) for every out-neighbor dst in out_neighbors[out_indices[s] .. out_indices[s + 1]). No barrier is taken here (the barrier.sync() call is commented out). E and block_size_x are not used. Always returns 0. */ +template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + 
//if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int s = start; s < end; s++) { + if(from_vertexset[s]) { + int degree = out_indices[s + 1] - out_indices[s]; + int * neighbors = &out_neighbors[out_indices[s]]; + for(int d = 0; d < degree; d++) { + apply_func (s, neighbors[d]); + //if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); } + + } + } //end of loop on in neighbors + } //end of outer for loop + //barrier.sync(); + return 0; +} //end of edgeset apply function + + +/* Per-vertex functors. The first copies generated_tmp_vector_3[v] into out_degree[v]; the next three reset new_rank[v], old_rank[v] and p[v] to zero. */ +struct generated_vector_op_apply_func_4 +{ + void operator() (int v) + { + out_degree[v] = generated_tmp_vector_3[v]; + }; +}; +struct new_rank_generated_vector_op_apply_func_2 +{ + void operator() (int v) + { + new_rank[v] = ((float) 0) ; + }; +}; +struct old_rank_generated_vector_op_apply_func_1 +{ + void operator() (int v) + { + old_rank[v] = ((float) 0) ; + }; +}; +struct p_generated_vector_op_apply_func_0 +{ + void operator() (int v) + { + p[v] = ((float) 0) ; + }; +}; +/* Edge update: adds ((1 - alpha) / (1 + alpha)) * old_rank[src] / out_degree[src] to new_rank[dst], with alpha = 0.15. This is a plain read-modify-write. NOTE(review): in the push traversal several sources can share one dst - confirm whether tiles can race on new_rank[dst]. */ +struct updateEdge +{ + void operator() (int src, int dst) + { + double alpha = 0.15; + new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); + }; +}; +/* Vertex update: adds (2 * alpha / (1 + alpha)) * old_rank[v] to p[v] (alpha = 0.15), then clears new_rank[v]. */ +struct updateSelf +{ + void operator() (int v) + { + double alpha = 0.15; + p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); + new_rank[v] = (0) ; + }; +}; +/* Frontier predicate: false when old_rank[v] is exactly zero, otherwise true when old_rank[v] exceeds out_degree[v] * epsilon (epsilon = 1e-6). */ +struct filter_frontier +{ + bool operator() (int v) + { + double epsilon = (double) 1e-6; + bool output ; + if(old_rank[v] == 0) return 0; + //output = (old_rank[v]) > ((out_degree[v] * epsilon)); + output = (old_rank[v]) > ((out_degree[v] * epsilon)); + return output; + }; +}; + +/* Kernel entry points for the vector-op functors. Each one applies its functor to every vertex in the local_range(V, ...) range, waits at barrier.sync() and returns 0. */ +extern "C" int __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + 
p_generated_vector_op_apply_func_0()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + old_rank_generated_vector_op_apply_func_1()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + new_rank_generated_vector_op_apply_func_2()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + generated_vector_op_apply_func_4()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int V, int tag_c) { + //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c); + /* Applies updateSelf to every vertex in the local_range(V, ...) range. The pass sits between bsg_cuda_print_stat_start(tag_c) and bsg_cuda_print_stat_end(tag_c); note that here the stat start comes before the first barrier, unlike the edgeset entry points below. Returns 0. */ + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + updateSelf()(iter_x); + } + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} +/* Kernel entry point for the pull direction: barrier, then the pull traversal with updateEdge between stat start/end for tag_c, then a final barrier. Returns 0. */ +extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); + bsg_cuda_print_stat_start(tag_c); + edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +/* Kernel entry point for the push direction: same sequence as the pull entry point, but runs the push traversal. Returns 0. */ + extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int
block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { + //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c); + //pr_dbg("%i: on frontier filter %i\n", bsg_id, tag_c); + /* Rebuilds the frontier flags: for each vertex in the local_range(V, ...) range, next5[v] becomes 1 if filter_frontier accepts v and 0 otherwise. The loop stops at the first index that is not below V. block_size_x is not used. Returns 0. */ + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if (iter_x < V) { + next5[iter_x] = 0; + if ( filter_frontier()( iter_x ) ) { + next5[iter_x] = 1; + //pr_dbg("added vertex %i to frontier\n", iter_x); + } + } + else { break; } + } //end of loop + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + diff --git a/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp new file mode 100644 index 000000000..ee50a54d6 --- /dev/null +++ b/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp @@ -0,0 +1,9 @@ +#pragma once +#ifndef __PR_PULL_BENCHMARK_HPP +#define __PR_PULL_BENCHMARK_HPP + +#include +#include +#include +#include +#endif diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp new file mode 100644 index 000000000..e7813101a --- /dev/null +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -0,0 +1,247 @@ +#include "pr.hpp" + +//#define DEBUG + +#define VERIFY 1 + +#ifdef DEBUG +#define X 1 +#define Y 1 +#else +#define X 16 +#define Y 8 +#endif + +#define ROOT 6 //eventually we will need to do 50 start vertices (in parallel) +#define NUM_LOCKS 1024 //width of manycore * 64 + +GraphHB edges; +GlobalScalar p_dev; +GlobalScalar old_rank_dev; +GlobalScalar new_rank_dev; +GlobalScalar out_degree_dev; +GlobalScalar alpha_dev;
+GlobalScalar epsilon_dev; + +#include "pr_host.hpp" + +int launch(int argc, char ** argv){ + InputParser input(argc, argv); + if(!input.cmdOptionExists("-g")){ + std::cerr << "no input args\n"; + return 0; + } + std::string ucode_path = input.getRISCVFile(); + + int iter = 0; + // std::string iterstrbase = "iteration-"; + // auto pos = ucode_path.find(iterstrbase); + // auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); + // std::stringstream ss(iterstr); + // ss >> iter; + std::cerr << "iteration: " << iter << std::endl; + + int version = 0; //default to vertex pull + if(ucode_path.find("push") != std::string::npos) { + version = 1; + } + else if(ucode_path.find("block") != std::string::npos) { + version = 2; + } + int hybrid = 0; //default to vertex pull + if(ucode_path.find("hybrid") != std::string::npos) { + hybrid = 1; + } + std::cerr << "version: " << version << std::endl; + std::cerr << "load microcode" << std::endl; + hammerblade::builtin_loadMicroCodeFromFile(ucode_path); + + std::cerr << "load graph" << std::endl; + std::string graph_f = input.getCmdOption("-g"); + edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); + + std::cerr << "size of graph: " << std::endl; + std::cerr << edges.num_nodes() << std::endl; + std::cerr << edges.num_edges() << std::endl; + + std::cerr << "init global scalars" << std::endl; + p_dev = GlobalScalar("p"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); + old_rank_dev = GlobalScalar("old_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); + new_rank_dev = GlobalScalar("new_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); + out_degree_dev = GlobalScalar("out_degree"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); + alpha_dev = GlobalScalar("alpha"); + epsilon_dev = GlobalScalar("epsilon"); + + std::cerr 
<< "init locks" << std::endl; + GlobalScalar glbl_locks = GlobalScalar("locks"); + hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); + std::atomic tmp_a[NUM_LOCKS] = {}; + + Device::Ptr device = Device::GetInstance(); + + float alpha = ((float) 0.15) ; + float epsilon = ((float) 1e-06) ; + int start_vertex = ROOT; + + + Vector frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); + //Vector next_frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); + + std::vector hfrontier(edges.num_nodes(), 0); + std::vector p(edges.num_nodes(), (double) 0.0); + std::vector new_rank(edges.num_nodes(), (double) 0.0); + std::vector old_rank(edges.num_nodes(), (double) 0.0); + std::vector out_degs = edges.get_out_degrees(); + + //compute up to current iter on host + hfrontier[start_vertex] = 1; + new_rank[start_vertex] = (double) 1.0; + old_rank[start_vertex] = (double) 1.0; + host_pr_calc(p, old_rank, new_rank, hfrontier, iter); + + frontier.copyToDevice(hfrontier.data(), hfrontier.size()); + + //next_frontier.copyToDevice(zeros.data(), zeros.size()); + hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); + hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); + hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); + hammerblade::write_global_buffer_dma(out_degs.data(), out_degree_dev, out_degs.size()); + hammerblade::write_global_buffer_dma>(tmp_a, glbl_locks, NUM_LOCKS); + + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + if(hybrid || version == 2) { + int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1); + int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges()); + if(dir){ + if(version != 2) version = 0; //pull + } else { + version = 1; //push + } + } + + //alpha_dev.set(alpha); + //epsilon_dev.set(epsilon); + + //hammerblade::builtin_addVertexHB(frontier, start_vertex); + 
//hammerblade::insert_val(start_vertex, ((double) 1) , old_rank_dev); + //hammerblade::insert_val(start_vertex, ((double) 1) , new_rank_dev); + + std::cerr << "start of while loop\n"; + int tag_c = 0; + //double host_rank[edges.num_nodes()]; + //ofstream prog_file; + //prog_file.open("./progress.txt"); + //prog_file << "starting computation w/ root vertex: " << start_vertex << std::endl; + //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) + //while ( iter < 16) + for(int i = 0; i < 1; i++) + { + int f_sz = 0; + //new_rank = old_rank; + switch(version) { + case 0: //vertex pull + std::cerr << "pull kernel\n"; + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + case 1: //vertex push + std::cerr << "push kernel\n"; + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", 
hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + case 2: //blocked pull + std::cerr << "blocked pull kernel\n"; + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInVertexlistAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + } + tag_c++; + + iter++; + //prog_file << "finished iteration: " << iter << std::endl; + } + std::cerr << "*******end of program********\n"; + //prog_file << "*******end of program********\n"; + std::cerr << "took: " << iter << " iterations to complete\n"; + //prog_file << "took: " << iter << " 
iterations to complete\n"; + //prog_file.close(); + if(VERIFY) { + ofstream ver_file; + ver_file.open("./rank.txt"); + double host_rank[edges.num_nodes()]; + hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); + for(int i = 0; i < edges.num_nodes(); i++) { + ver_file << host_rank[i] << std::endl; + } + ver_file.close(); + } + return 0; +} + +#ifdef VCS +int vcs_main(int argc, char ** argv){ + // int argc = get_argc(args); + // char *argv[argc]; + // get_argv(args, argc, argv); + // svScope scope; + // scope = svGetScopeFromName("tb"); + // svSetScope(scope); + bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n"); + int rc = launch(argc, argv); + //*exit_code = rc; + bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); + return rc; +} +#else +int main(int argc, char ** argv) { + bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n"); + int rc = launch(argc, argv); + bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); + return rc; +} +#endif diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp new file mode 100644 index 000000000..b73169532 --- /dev/null +++ b/examples/graphit/test_pr_nibble/pr.hpp @@ -0,0 +1,25 @@ +#ifndef __PR_PULL_BENCHMARK_HPP +#define __PR_PULL_BENCHMARK_HPP + +#include "hb_intrinsics.h" +#include "infra_hb/host/arg_parser.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include "../common.h" + + +using hammerblade::Device; +using hammerblade::Vector; +using hammerblade::GraphHB; +using hammerblade::GlobalScalar; + +#endif diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp new file mode 100644 index 000000000..0168845f0 --- /dev/null +++ b/examples/graphit/test_pr_nibble/pr_host.hpp @@ -0,0 +1,53 @@ +//function to compute pr-nibble on host up to current iter +#pragma once +#include +#include + +inline void host_pr_calc(std::vector & p, std::vector & 
old_rank, std::vector & new_rank, std::vector & frontier, int iter) { + double alpha = (double) 0.15; + double epsilon = (double) 1e-06; + auto g = edges.getHostGraph(); + int * in_neigh = g.in_neighbors_shared_.get(); + int ** in_index = g.in_index_shared_.get(); + std::string fname = "iter-" + std::to_string(iter) + ".txt"; + ofstream ofile; + ofile.open (fname); + for(int i = 0; i < iter; i++) { + //std::memcpy(new_rank, old_rank, sizeof(double)*edges.num_nodes()); + //new_rank = old_rank; + new_rank.assign(old_rank.begin(), old_rank.end()); + //print out iteration and size: + int num_items = std::count(frontier.begin(), frontier.end(), 1); + std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; + //update_self + for(int v = 0; v < g.num_nodes(); v++) { + p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; + new_rank[v] = (double) 0.0 ; + } + //update edges + for(int d = 0; d < g.num_nodes(); d++) { + for(int s : g.in_neigh(d)) { + if(frontier[s]){ + double update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; + update = update / ((double) g.out_degree(s)); + new_rank[d] += update; + if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;} + } + } + } + //old_rank.swap(new_rank); + //std::memcpy(old_rank, new_rank, sizeof(double)*edges.num_nodes()); + //old_rank = new_rank; + old_rank.assign(new_rank.begin(), new_rank.end()); + //update frontier + for(int v = 0; v < g.num_nodes(); v++) { + frontier[v] = 0; + if(g.out_degree(v) > 0 && old_rank[v] >= (((double) g.out_degree(v)) * epsilon)) { + frontier[v] = 1; + } + } + } + ofile.close(); + int num_items = std::count(frontier.begin(), frontier.end(), 1); + std::cerr << "returning with frontier size: " << num_items << std::endl; +} diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile index 74c5c5b7c..4291f23a1 100644 --- a/examples/graphit/test_vec_add_parallel/Makefile +++ 
b/examples/graphit/test_vec_add_parallel/Makefile @@ -98,7 +98,7 @@ TILE_GROUP_DIM_Y = 2 RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) -include $(EXAMPLES_PATH)/cuda/riscv.mk +include $(EXAMPLES_PATH)/graphit/riscv.mk ############################################################################### # Execution flow From e0276f6674ca3e6570c31136791f219ab3dd0070 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 22 Mar 2021 16:17:00 -0700 Subject: [PATCH 11/22] updated, still broken --- examples/graphit/riscv.mk | 4 ++-- examples/graphit/test_pr_nibble/Makefile | 6 +++--- examples/graphit/test_pr_nibble/kernel.cpp | 12 ++++++------ examples/graphit/test_pr_nibble/main.cpp | 10 ++-------- examples/graphit/test_pr_nibble/pr.hpp | 2 +- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk index 1266a5e7f..87a52d511 100644 --- a/examples/graphit/riscv.mk +++ b/examples/graphit/riscv.mk @@ -54,7 +54,7 @@ RISCV_LLVM_PATH := $(RISCV_TOOLS_PATH)/llvm/llvm-install RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py # These flags are not supported by clang -RISCV_GNU_FLAGS = -mno-fdiv -frerun-cse-after-loop -fweb -frename-registers +RISCV_GNU_FLAGS = -frerun-cse-after-loop -fweb -frename-registers -mtune=bsg_vanilla_2020 RISCV_GCC ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS) RISCV_GXX ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS) @@ -66,7 +66,7 @@ RISCV_LINK ?= $(RISCV_GCC) -t -T $(LINK_SCRIPT) $(RISCV_LDFLAGS) RISCV_LD ?= $(RISCV_GCC) RISCV_CLANG_ABI = ilp32f -RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) +RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) -march=riscv32imaf -mtune=hb-rv32 RISCV_CLANG_CXXFLAGS += --sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs RISCV_CLANG_CXXFLAGS += 
-I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0 RISCV_CLANG_CXXFLAGS += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index d456fd96d..0088fe364 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -67,7 +67,7 @@ CXXDEFINES += FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable CFLAGS += -std=c99 $(FLAGS) -CXXFLAGS += -std=c++14 $(FLAGS) +CXXFLAGS += -std=c++14 $(FLAGS) HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ @@ -82,7 +82,7 @@ main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h # Host code link flags and flow ############################################################################### -LDFLAGS += +LDFLAGS += # link.mk defines rules for linking of the final execution binary. include $(EXAMPLES_PATH)/link.mk @@ -116,7 +116,7 @@ include $(EXAMPLES_PATH)/graphit/riscv.mk # # SIM_ARGS: Use this to pass arguments to the simulator ############################################################################### -C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) +C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) #-g $(GRAPH_PATH) SIM_ARGS ?= diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp index d49112e73..c115b9f73 100644 --- a/examples/graphit/test_pr_nibble/kernel.cpp +++ b/examples/graphit/test_pr_nibble/kernel.cpp @@ -25,9 +25,9 @@ bsg_barrier barrier; #define pr_dbg(fmt, ...) 
#endif -__attribute__((section(".dram"))) double * __restrict p; -__attribute__((section(".dram"))) double * __restrict old_rank; -__attribute__((section(".dram"))) double * __restrict new_rank; +__attribute__((section(".dram"))) float * __restrict p; +__attribute__((section(".dram"))) float * __restrict old_rank; +__attribute__((section(".dram"))) float * __restrict new_rank; __attribute__((section(".dram"))) int * __restrict out_degree; __attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; //__attribute__((section(".dram"))) double alpha = 0.15; @@ -106,7 +106,7 @@ struct updateEdge { void operator() (int src, int dst) { - double alpha = 0.15; + float alpha = 0.15; new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); }; }; @@ -114,7 +114,7 @@ struct updateSelf { void operator() (int v) { - double alpha = 0.15; + float alpha = 0.15; p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); new_rank[v] = (0) ; }; @@ -123,7 +123,7 @@ struct filter_frontier { bool operator() (int v) { - double epsilon = (double) 1e-6; + float epsilon = (float) 1e-6; bool output ; if(old_rank[v] == 0) return 0; //output = (old_rank[v]) > ((out_degree[v] * epsilon)); diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index e7813101a..3897a44d4 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -57,7 +57,8 @@ int launch(int argc, char ** argv){ hammerblade::builtin_loadMicroCodeFromFile(ucode_path); std::cerr << "load graph" << std::endl; - std::string graph_f = input.getCmdOption("-g"); + //std::string graph_f = input.getCmdOption("-g"); + std::string graph_f = "~/research/bladerunner6.0/graphit-new/test/graphs/darpa-eval/jhu.mtx"; edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); std::cerr << "size of graph: " << std::endl; @@ -225,15 +226,8 @@ int launch(int argc, char ** argv){ #ifdef 
VCS int vcs_main(int argc, char ** argv){ - // int argc = get_argc(args); - // char *argv[argc]; - // get_argv(args, argc, argv); - // svScope scope; - // scope = svGetScopeFromName("tb"); - // svSetScope(scope); bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n"); int rc = launch(argc, argv); - //*exit_code = rc; bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); return rc; } diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp index b73169532..b1f9ac484 100644 --- a/examples/graphit/test_pr_nibble/pr.hpp +++ b/examples/graphit/test_pr_nibble/pr.hpp @@ -1,3 +1,4 @@ +#pragma once #ifndef __PR_PULL_BENCHMARK_HPP #define __PR_PULL_BENCHMARK_HPP @@ -14,7 +15,6 @@ #include #include #include -//#include "../common.h" using hammerblade::Device; From e7f74e62bd59cbb7bb507f4c74903f6eb8bd49d6 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 29 Mar 2021 15:04:36 -0700 Subject: [PATCH 12/22] working pr nibble test --- examples/graphit/test_pr_nibble/Makefile | 7 +-- examples/graphit/test_pr_nibble/main.cpp | 48 ++++++++++----------- examples/graphit/test_pr_nibble/pr_host.hpp | 18 ++++---- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 0088fe364..8dec2f3be 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -44,8 +44,8 @@ REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) include $(REPLICANT_PATH)/environment.mk SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime -GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx @@ -106,7 +106,7 @@ RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) RISCV_INCLUDES += 
-I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ -include $(EXAMPLES_PATH)/graphit/riscv.mk +include $(EXAMPLES_PATH)/cuda/riscv.mk ############################################################################### # Execution flow @@ -116,7 +116,7 @@ include $(EXAMPLES_PATH)/graphit/riscv.mk # # SIM_ARGS: Use this to pass arguments to the simulator ############################################################################### -C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) #-g $(GRAPH_PATH) +C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) SIM_ARGS ?= @@ -142,6 +142,7 @@ help: @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" @echo " clean: Remove all subdirectory-specific outputs" +print-% : ; @echo $* = $($*) .PHONY: clean diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index 3897a44d4..e5b434481 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -20,8 +20,8 @@ GlobalScalar p_dev; GlobalScalar old_rank_dev; GlobalScalar new_rank_dev; GlobalScalar out_degree_dev; -GlobalScalar alpha_dev; -GlobalScalar epsilon_dev; +//GlobalScalar alpha_dev; +//GlobalScalar epsilon_dev; #include "pr_host.hpp" @@ -57,25 +57,25 @@ int launch(int argc, char ** argv){ hammerblade::builtin_loadMicroCodeFromFile(ucode_path); std::cerr << "load graph" << std::endl; - //std::string graph_f = input.getCmdOption("-g"); - std::string graph_f = "~/research/bladerunner6.0/graphit-new/test/graphs/darpa-eval/jhu.mtx"; + std::string graph_f = input.getCmdOption("-g"); edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); std::cerr << "size of graph: " << std::endl; std::cerr << edges.num_nodes() << std::endl; std::cerr << edges.num_edges() << std::endl; - std::cerr << "init global scalars" << std::endl; + p_dev = GlobalScalar("p"); - 
hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); + + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); old_rank_dev = GlobalScalar("old_rank"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); new_rank_dev = GlobalScalar("new_rank"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); out_degree_dev = GlobalScalar("out_degree"); hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); - alpha_dev = GlobalScalar("alpha"); - epsilon_dev = GlobalScalar("epsilon"); + //alpha_dev = GlobalScalar("alpha"); + //epsilon_dev = GlobalScalar("epsilon"); std::cerr << "init locks" << std::endl; GlobalScalar glbl_locks = GlobalScalar("locks"); @@ -87,29 +87,27 @@ int launch(int argc, char ** argv){ float alpha = ((float) 0.15) ; float epsilon = ((float) 1e-06) ; int start_vertex = ROOT; - - Vector frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); //Vector next_frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); std::vector hfrontier(edges.num_nodes(), 0); - std::vector p(edges.num_nodes(), (double) 0.0); - std::vector new_rank(edges.num_nodes(), (double) 0.0); - std::vector old_rank(edges.num_nodes(), (double) 0.0); + std::vector p(edges.num_nodes(), (float) 0.0); + std::vector new_rank(edges.num_nodes(), (float) 0.0); + std::vector old_rank(edges.num_nodes(), (float) 0.0); std::vector out_degs = edges.get_out_degrees(); //compute up to current iter on host hfrontier[start_vertex] = 1; - new_rank[start_vertex] = (double) 1.0; - old_rank[start_vertex] = (double) 1.0; + new_rank[start_vertex] = (float) 1.0; + old_rank[start_vertex] = (float) 1.0; host_pr_calc(p, old_rank, new_rank, hfrontier, iter); 
frontier.copyToDevice(hfrontier.data(), hfrontier.size()); //next_frontier.copyToDevice(zeros.data(), zeros.size()); - hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); - hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); - hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); + hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); + hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); + hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); hammerblade::write_global_buffer_dma(out_degs.data(), out_degree_dev, out_degs.size()); hammerblade::write_global_buffer_dma>(tmp_a, glbl_locks, NUM_LOCKS); @@ -160,7 +158,7 @@ int launch(int argc, char ** argv){ device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); device->runJobs(); std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; @@ -175,7 +173,7 @@ int launch(int argc, char ** argv){ device->runJobs(); tag_c++; std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); std::cerr << "create next frontier\n"; device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); device->runJobs(); @@ -196,7 +194,7 @@ int launch(int argc, char ** argv){ device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); device->runJobs(); std::cerr << "swap arrays\n"; - 
hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; @@ -214,8 +212,8 @@ int launch(int argc, char ** argv){ if(VERIFY) { ofstream ver_file; ver_file.open("./rank.txt"); - double host_rank[edges.num_nodes()]; - hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); + float host_rank[edges.num_nodes()]; + hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); for(int i = 0; i < edges.num_nodes(); i++) { ver_file << host_rank[i] << std::endl; } diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp index 0168845f0..7e7479495 100644 --- a/examples/graphit/test_pr_nibble/pr_host.hpp +++ b/examples/graphit/test_pr_nibble/pr_host.hpp @@ -3,9 +3,9 @@ #include #include -inline void host_pr_calc(std::vector & p, std::vector & old_rank, std::vector & new_rank, std::vector & frontier, int iter) { - double alpha = (double) 0.15; - double epsilon = (double) 1e-06; +inline void host_pr_calc(std::vector & p, std::vector & old_rank, std::vector & new_rank, std::vector & frontier, int iter) { + float alpha = (float) 0.15; + float epsilon = (float) 1e-06; auto g = edges.getHostGraph(); int * in_neigh = g.in_neighbors_shared_.get(); int ** in_index = g.in_index_shared_.get(); @@ -13,7 +13,7 @@ inline void host_pr_calc(std::vector & p, std::vector & old_rank ofstream ofile; ofile.open (fname); for(int i = 0; i < iter; i++) { - //std::memcpy(new_rank, old_rank, sizeof(double)*edges.num_nodes()); + //std::memcpy(new_rank, old_rank, sizeof(float)*edges.num_nodes()); //new_rank = old_rank; new_rank.assign(old_rank.begin(), old_rank.end()); //print out iteration and size: @@ -22,27 +22,27 @@ inline void host_pr_calc(std::vector & p, std::vector & old_rank 
//update_self for(int v = 0; v < g.num_nodes(); v++) { p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; - new_rank[v] = (double) 0.0 ; + new_rank[v] = (float) 0.0 ; } //update edges for(int d = 0; d < g.num_nodes(); d++) { for(int s : g.in_neigh(d)) { if(frontier[s]){ - double update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; - update = update / ((double) g.out_degree(s)); + float update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; + update = update / ((float) g.out_degree(s)); new_rank[d] += update; if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;} } } } //old_rank.swap(new_rank); - //std::memcpy(old_rank, new_rank, sizeof(double)*edges.num_nodes()); + //std::memcpy(old_rank, new_rank, sizeof(float)*edges.num_nodes()); //old_rank = new_rank; old_rank.assign(new_rank.begin(), new_rank.end()); //update frontier for(int v = 0; v < g.num_nodes(); v++) { frontier[v] = 0; - if(g.out_degree(v) > 0 && old_rank[v] >= (((double) g.out_degree(v)) * epsilon)) { + if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) { frontier[v] = 1; } } From 206bce5f01f24091e0585c36daacab267420178e Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Thu, 1 Apr 2021 15:50:17 -0700 Subject: [PATCH 13/22] initial delta stepping test, fixes for pr nibble --- examples/graphit/test_pr_nibble/Makefile | 2 +- examples/graphit/test_pr_nibble/kernel.cpp | 9 +- examples/graphit/test_pr_nibble/main.cpp | 14 +- examples/graphit/test_sssp_delta/Makefile | 151 ++++++++++++ examples/graphit/test_sssp_delta/kernel.cpp | 99 ++++++++ .../kernel/include/pr_nibble.hpp | 9 + .../test_sssp_delta/kernel/include/sssp.hpp | 8 + examples/graphit/test_sssp_delta/main.cpp | 219 ++++++++++++++++++ examples/graphit/test_sssp_delta/sssp.hpp | 26 +++ 9 files changed, 522 insertions(+), 15 deletions(-) create mode 100644 examples/graphit/test_sssp_delta/Makefile create mode 100644 examples/graphit/test_sssp_delta/kernel.cpp create mode 100644 
examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp create mode 100644 examples/graphit/test_sssp_delta/kernel/include/sssp.hpp create mode 100644 examples/graphit/test_sssp_delta/main.cpp create mode 100644 examples/graphit/test_sssp_delta/sssp.hpp diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 8dec2f3be..4bfca9a2b 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -67,7 +67,7 @@ CXXDEFINES += FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable CFLAGS += -std=c99 $(FLAGS) -CXXFLAGS += -std=c++14 $(FLAGS) +CXXFLAGS += -std=c++11 $(FLAGS) HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp index c115b9f73..79186ab8a 100644 --- a/examples/graphit/test_pr_nibble/kernel.cpp +++ b/examples/graphit/test_pr_nibble/kernel.cpp @@ -125,9 +125,10 @@ struct filter_frontier { float epsilon = (float) 1e-6; bool output ; - if(old_rank[v] == 0) return 0; + //if(old_rank[v] == 0) return 0; + if(new_rank[v] == 0) return 0; //output = (old_rank[v]) > ((out_degree[v] * epsilon)); - output = (old_rank[v]) > ((out_degree[v] * epsilon)); + output = (new_rank[v]) > ((out_degree[v] * epsilon)); return output; }; }; @@ -185,7 +186,9 @@ extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_verte barrier.sync(); //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); bsg_cuda_print_stat_end(tag_c); barrier.sync(); return 0; @@ -194,7 +197,9 @@ extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_verte extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int 
*frontier, int V, int E, int block_size_x, int tag_c) { barrier.sync(); bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); bsg_cuda_print_stat_end(tag_c); barrier.sync(); return 0; diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index e5b434481..ff396f302 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -2,7 +2,7 @@ //#define DEBUG -#define VERIFY 1 +#define VERIFY 0 #ifdef DEBUG #define X 1 @@ -66,7 +66,6 @@ int launch(int argc, char ** argv){ std::cerr << "init global scalars" << std::endl; p_dev = GlobalScalar("p"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); old_rank_dev = GlobalScalar("old_rank"); hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); @@ -76,14 +75,11 @@ int launch(int argc, char ** argv){ hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); //alpha_dev = GlobalScalar("alpha"); //epsilon_dev = GlobalScalar("epsilon"); - std::cerr << "init locks" << std::endl; GlobalScalar glbl_locks = GlobalScalar("locks"); hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); std::atomic tmp_a[NUM_LOCKS] = {}; - Device::Ptr device = Device::GetInstance(); - float alpha = ((float) 0.15) ; float epsilon = ((float) 1e-06) ; int start_vertex = ROOT; @@ -124,13 +120,6 @@ int launch(int argc, char ** argv){ } } - //alpha_dev.set(alpha); - //epsilon_dev.set(epsilon); - - //hammerblade::builtin_addVertexHB(frontier, start_vertex); - //hammerblade::insert_val(start_vertex, ((double) 1) , old_rank_dev); - //hammerblade::insert_val(start_vertex, ((double) 1) , new_rank_dev); - std::cerr << "start of while loop\n"; int tag_c = 0; //double host_rank[edges.num_nodes()]; @@ -219,6 +208,7 @@ int launch(int argc, char ** argv){ 
} ver_file.close(); } + device->finish(); return 0; } diff --git a/examples/graphit/test_sssp_delta/Makefile b/examples/graphit/test_sssp_delta/Makefile new file mode 100644 index 000000000..2e980c4e2 --- /dev/null +++ b/examples/graphit/test_sssp_delta/Makefile @@ -0,0 +1,151 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This Makefile compiles, links, and executes examples Run `make help` +# to see the available targets for the selected platform. 
+ +################################################################################ +# environment.mk verifies the build environment and sets the following +# makefile variables: +# +# LIBRAIRES_PATH: The path to the libraries directory +# HARDWARE_PATH: The path to the hardware directory +# EXAMPLES_PATH: The path to the examples directory +# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL +# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore +############################################################################### + +REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) + +include $(REPLICANT_PATH)/environment.mk +SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd +CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime +CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new + +GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx + +# TEST_NAME is the basename of the executable +TEST_NAME = main +# KERNEL_NAME is the name of the CUDA-Lite Kernel +KERNEL_NAME = sssp + +############################################################################### +# Host code compilation flags and flow +############################################################################### + +# TEST_SOURCES is a list of source files that need to be compiled +TEST_SOURCES = main.cpp + +DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE +CDEFINES += +CXXDEFINES += + +FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable +CFLAGS += -std=c99 $(FLAGS) +CXXFLAGS += -std=c++11 $(FLAGS) + +HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ + +# compilation.mk defines rules for compilation of C/C++ +include $(EXAMPLES_PATH)/compilation.mk + +# Specify any header file dependencies +main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ +main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h + +############################################################################### +# 
Host code link flags and flow +############################################################################### + +LDFLAGS += + +# link.mk defines rules for linking of the final execution binary. +include $(EXAMPLES_PATH)/link.mk + +############################################################################### +# Device code compilation flow +############################################################################### + +# BSG_MANYCORE_KERNELS is a list of manycore executables that should +# be built before executing. +BSG_MANYCORE_KERNELS = kernel.riscv + +kernel.rvo: RISCV_CXX = $(RISCV_GXX) +kernel.riscv: kernel.rvo + +# Tile Group Dimensions +TILE_GROUP_DIM_X = 16 +TILE_GROUP_DIM_Y = 8 +RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) + +RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_sssp_delta/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ + +include $(EXAMPLES_PATH)/cuda/riscv.mk + +############################################################################### +# Execution flow +# +# C_ARGS: Use this to pass arguments that you want to appear in argv +# For SPMD tests C arguments are: +# +# SIM_ARGS: Use this to pass arguments to the simulator +############################################################################### +C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) + +SIM_ARGS ?= + +# Include platform-specific execution rules +include $(EXAMPLES_PATH)/execution.mk + +############################################################################### +# Regression Flow +############################################################################### + +regression: main.exec.log + @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null + +############################################################################### +# Default rules, help, and clean +############################################################################### +.DEFAULT_GOAL := help 
+help: + @echo "Usage:" + @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" + @echo " $(TEST_NAME).profile: Build executable with profilers enabled" + @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" + @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" + @echo " clean: Remove all subdirectory-specific outputs" + +print-% : ; @echo $* = $($*) + +.PHONY: clean + +clean: + + diff --git a/examples/graphit/test_sssp_delta/kernel.cpp b/examples/graphit/test_sssp_delta/kernel.cpp new file mode 100644 index 000000000..5039f1dd2 --- /dev/null +++ b/examples/graphit/test_sssp_delta/kernel.cpp @@ -0,0 +1,99 @@ +#include +#include + +//#define BSG_TILE_GROUP_X_DIM 16 +//#define BSG_TILE_GROUP_Y_DIM 2 +//#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +//#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#include +bsg_barrier barrier; +#include + +//#define DEBUG +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) 
+#endif + + +__attribute__((section(".dram"))) int * __restrict dist; + +template int edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(int *in_indices , WNode *in_neighbors, int* from_vertexset, int * next_frontier, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + bsg_cuda_print_stat_start(1); + bsg_saif_start(); + int start, end; + local_range(V, &start, &end); + if(bsg_id == 0) pr_dbg("elem 1: %i and dist: %i and random weight: %i\n", from_vertexset[5], dist[5], in_neighbors[in_indices[5]].weight); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + WNode * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s].vertex]) { + if( apply_func ( neighbors[s].vertex, d, neighbors[s].weight )) { + next_frontier[d] = 1; + } + } + } //end of loop on in neighbors + } //end of outer for loop + bsg_saif_end(); + bsg_cuda_print_stat_end(1); + barrier.sync(); + return 0; +} //end of edgeset apply function + + +struct dist_generated_vector_op_apply_func_0 +{ + void operator() (int v) + { + dist[v] = (2147483647) ; + }; +}; +struct updateEdge +{ + bool operator() (int src, int dst, int weight) + { + bool output3 = false; + int new_dist = (dist[src] + weight); + if(dist[dst] > new_dist) { + dist[dst] = new_dist; + output3 = true; + } + return output3; + }; +}; +struct reset +{ + void operator() (int v) + { + dist[v] = (2147483647) ; + }; +}; + +extern "C" int __attribute__ ((noinline)) dist_generated_vector_op_apply_func_0_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + dist_generated_vector_op_apply_func_0()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) reset_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + reset()(iter_x); + } + barrier.sync(); + return 0; +} 
+extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call(int *in_indices, WNode *in_neighbors, int *frontier, int *modified_vertexsubset1, int V, int E, int block_size_x) { + edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(in_indices, in_neighbors, frontier, modified_vertexsubset1, updateEdge(), V, E, block_size_x); + return 0; +} + + diff --git a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp new file mode 100644 index 000000000..ee50a54d6 --- /dev/null +++ b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp @@ -0,0 +1,9 @@ +#pragma once +#ifndef __PR_PULL_BENCHMARK_HPP +#define __PR_PULL_BENCHMARK_HPP + +#include +#include +#include +#include +#endif diff --git a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp new file mode 100644 index 000000000..23da05000 --- /dev/null +++ b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp @@ -0,0 +1,8 @@ +#ifndef __SSSP_BENCHMARK_HPP +#define __SSSP_BENCHMARK_HPP + +#include +#include +#include +#include +#endif diff --git a/examples/graphit/test_sssp_delta/main.cpp b/examples/graphit/test_sssp_delta/main.cpp new file mode 100644 index 000000000..069ee5426 --- /dev/null +++ b/examples/graphit/test_sssp_delta/main.cpp @@ -0,0 +1,219 @@ +#include "sssp.hpp" +#define X 16 +#define Y 2 +#define NUM_LOCKS 1024 +#define VERIFY false +#define ROOT 6 +#define DELTA 32 + +WGraphHB edges; +GlobalScalar dist_dev; +//BucketPriorityQueue pq; + +bool apply(int s, int d, int w, std::vector &dist) { + int new_dist = (dist[s] + w); + if(dist[d] > new_dist) { + dist[d] = new_dist; + return true; + } + return false; +} + +void sssp_pull_call(std::vector &front, std::vector &next, std::vector &dist) { + auto g = edges.getHostGraph(); + auto * in_neigh = g.in_neighbors_shared_.get(); 
+ auto ** in_index = g.in_index_shared_.get(); + for(int d = 0; d < edges.num_nodes(); d++) { + int ind = in_index[d] - in_neigh; + int degree = g.in_degree(d); + auto * neighbors = &in_neigh[ind]; + for(int s = 0; s < degree; s++){ + if(front[neighbors[s].v]){ + if(apply(neighbors[s].v, d, neighbors[s].w, dist)) { + next[d] = 1; + } + } + } + } + +} + +void host_sssp_pull(BucketPriorityQueue& pq, std::vector &dist, int iter) { + dist[ROOT] = 0; + Device::Ptr device = Device::GetInstance(); + Vector next_frontier_dev = Vector(edges.num_nodes()); + std::vector h_next(edges.num_nodes(), 0); + std::vector h_front(edges.num_nodes(), 0); + + for(int i = 0; i < iter; i++) { + if(!(pq.finished() == 0)) { std::cout << "no more items on iter: " << i << "\n"; break; } + Vector front = pq.popDenseReadyVertexSet(); + front.copyToHost(h_front.data(), edges.num_nodes()); + device->freeze_cores(); + device->read_dma(); + device->unfreeze_cores(); + int num_elems = std::count(h_front.begin(), h_front.end(), 1); + std::cout << "num elems in front: " << num_elems << " val of 0: " << h_front[0] << std::endl; + sssp_pull_call(h_front, h_next, dist); + num_elems = std::count(h_next.begin(), h_next.end(), 1); + std::cout << "num elems in next front: " << num_elems << std::endl; + std::cout << "dist of 1: " << dist[1] << std::endl; + next_frontier_dev.copyToDevice(h_next.data(), edges.num_nodes()); + hammerblade::write_global_buffer_dma(dist.data(), dist_dev, edges.num_nodes()); + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + hammerblade::updateBucketWithGraphItVertexSubset(next_frontier_dev, pq); + std::fill(h_next.begin(), h_next.end(), 0); + } +} +void host_sssp_push(BucketPriorityQueue &pq, std::vector &dist, int iter) { + host_sssp_pull(pq, dist, iter); +} + +int launch(int argc, char * argv[]){ + InputParser input(argc, argv); + if(!input.cmdOptionExists("-g")) { + + std::cerr << "no input args\n"; + for(auto i = 0; i < argc; i++) { + std::cerr << 
argv[i] << " "; + } + std::cerr << std::endl; + return 0; + } + std::string ucode_path = input.getRISCVFile(); + + int iter = 0; + //std::string iterstrbase = "iteration-"; + //auto pos = ucode_path.find(iterstrbase); + //auto iterstr = ucode_path.substr(pos + iterstrbase.size(), std::string::npos); + //std::stringstream ss(iterstr); + //ss >> iter; + std::cerr << "iteration: " << iter << std::endl; + + int version = 0; //pull-vertex + if(ucode_path.find("push-vertex") != std::string::npos) { + version = 1; + } + std::cerr << "load microcode" << std::endl; + hammerblade::builtin_loadMicroCodeFromFile(ucode_path); + std::cerr << "load graph" << std::endl; + + std::string graph_f = input.getCmdOption("-g"); + //std::string frontier_f = input.getCmdOption("-f"); + edges = hammerblade::builtin_loadWeightedEdgesFromFileToHB (graph_f.c_str()); + std::cerr << "out deg of 0: " << edges.out_degree(5) << "num edges: " << edges.num_edges() << std::endl; + + + Device::Ptr device = Device::GetInstance(); + dist_dev = GlobalScalar("dist"); + hammerblade::init_global_array(edges.num_nodes(), dist_dev); + hammerblade::assign_val_dma(0, edges.num_nodes(), (2147483647), dist_dev); + int start_vertex = 0; + //hammerblade::insert_val(start_vertex, 0, dist_dev); + + std::cerr << "init locks\n"; + GlobalScalar glbl_locks = GlobalScalar("locks"); + hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); + std::atomic tmp_array[NUM_LOCKS] = {}; + hammerblade::write_global_buffer_dma>(tmp_array, glbl_locks, NUM_LOCKS); + + std::cerr << "doing batch dma write" << std::endl; + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + hammerblade::insert_val(start_vertex, 0, dist_dev); + std::cerr << "init pq" << std::endl; + BucketPriorityQueue pq = BucketPriorityQueue(edges.num_nodes(), &dist_dev, (hammerblade::BucketOrder)1, (hammerblade::PriorityOrder)0, (int) 128, (int) 32); + + std::cerr << "host side compute up to current iter: \n"; + std::vector 
h_dist(edges.num_nodes(), 2147483647); + if(version == 0) { + host_sssp_pull(pq, h_dist, iter); + } else { + host_sssp_push(pq, h_dist, iter); + } + hammerblade::write_global_buffer_dma(h_dist.data(), dist_dev, edges.num_nodes()); + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + + std::cerr << "starting while loop" << std::endl; + Vector next_frontier_dev; + switch(version){ + case 0: { // do dense pull bfs + //device->enqueueJob("init_kernel", hb_mc_dimension(X,Y), {edges.num_nodes()}); + //device->runJobs(); + for(int i = 0; i < 1; i++) //just doing one large iteration + { + + std::cerr << "doing SSSP Delta Stepping kernel" << std::endl; + //Vector frontier = hammerblade::getBucketWithGraphItVertexSubset(pq); + Vector frontier = pq.popDenseReadyVertexSet(); + std::cerr << "got frontier from pq\n"; + next_frontier_dev = Vector(edges.num_nodes()); + //next_frontier_dev.assign(0, edges.num_nodes(), 0); + //device->freeze_cores(); + //device->write_dma(); + //device->unfreeze_cores(); + printf("0x%08x\n", frontier.getAddr()); + printf("next: 0x%08x\n", next_frontier_dev.getAddr()); + std::cerr << "initialized next front\n"; + device->enqueueJob("edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call", + hb_mc_dimension(X,Y), + {edges.getInIndicesAddr(), + edges.getInNeighborsAddr(), + frontier.getAddr(), + next_frontier_dev.getAddr(), + edges.num_nodes(), + edges.num_edges(), + edges.num_nodes()}); + device->runJobs(); + std::cerr << "updating buckets:\n"; + hammerblade::updateBucketWithGraphItVertexSubset(next_frontier_dev, pq); + hammerblade::deleteObject(frontier); + } + break; + } + case 1: { //do sparse push blocked bfs + break; + } + } + + std::cerr << "finished while loop" << std::endl; + + if(VERIFY) { + int * host_next = new int[edges.num_nodes()]; + next_frontier_dev.copyToHost(host_next, edges.num_nodes()); + + device->freeze_cores(); + device->read_dma(); + device->unfreeze_cores(); + + ofstream 
file("./frontier_verify.txt"); + if(!file.is_open()) std::cerr <<"couldn't open file\n"; + for(int i = 0; i < edges.num_nodes(); i++) { + if(host_next[i] == 1 && i % 50 == 0) std::cerr << i << std::endl; + file << host_next[i] << std::endl; + } + file.close(); + } + device->finish(); + return 0; +} +#ifdef VCS +int vcs_main(int argc, char ** argv) { + bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n"); + int rc = launch(argc,argv); + bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); + return rc; +} +#else +int main(int argc, char ** argv) { + bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n"); + int rc = launch(argc,argv); + bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); + return rc; +} +#endif diff --git a/examples/graphit/test_sssp_delta/sssp.hpp b/examples/graphit/test_sssp_delta/sssp.hpp new file mode 100644 index 000000000..2dfcd3d5c --- /dev/null +++ b/examples/graphit/test_sssp_delta/sssp.hpp @@ -0,0 +1,26 @@ +#ifndef __SSSP_BENCHMARK_HPP +#define __SSSP_BENCHMARK_HPP + +#pragma once +#include "hb_intrinsics.h" +#include "infra_hb/host/arg_parser.hpp" +#include "infra_hb/host/priority_queue.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using hammerblade::Device; +using hammerblade::Vector; +using hammerblade::GraphHB; +using hammerblade::WGraphHB; +using hammerblade::GlobalScalar; +using hammerblade::BucketPriorityQueue; +using hammerblade::Bucket; +#endif From 72e4809aee544688da49c36d1956be9339682e58 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Sun, 4 Apr 2021 18:28:36 -0700 Subject: [PATCH 14/22] hack to get multiple kernel versions support, needs to be refactored --- examples/cuda/riscv.mk | 2 +- examples/graphit/test_pr_nibble/Makefile | 48 ++++++++++++++++++++---- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/examples/cuda/riscv.mk b/examples/cuda/riscv.mk index 87a52d511..00b37c1eb 100644 --- a/examples/cuda/riscv.mk +++ b/examples/cuda/riscv.mk @@ -244,7 
+244,7 @@ RISCV_LDFLAGS += -Wl,--no-check-sections # This builds a .riscv binary for the current machine type and tile # group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked # in the final binary. -%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) +kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ kernel.link.clean: diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 4bfca9a2b..74d58eeb3 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -39,21 +39,24 @@ # BSG_MANYCORE_DIR: Path to a clone of BSG Manycore ############################################################################### +CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) include $(REPLICANT_PATH)/environment.mk SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime -CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new +GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx - # TEST_NAME is the basename of the executable TEST_NAME = main # KERNEL_NAME is the name of the CUDA-Lite Kernel KERNEL_NAME = pr_nibble +VERSIONS = hybrid +DEFAULT_VERSION := hybrid +KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp + ############################################################################### # Host code compilation flags and flow ############################################################################### @@ -69,7 +72,6 @@ FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable CFLAGS += -std=c99 $(FLAGS) CXXFLAGS += -std=c++11 $(FLAGS) 
-HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ # compilation.mk defines rules for compilation of C/C++ include $(EXAMPLES_PATH)/compilation.mk @@ -93,11 +95,13 @@ include $(EXAMPLES_PATH)/link.mk # BSG_MANYCORE_KERNELS is a list of manycore executables that should # be built before executing. -BSG_MANYCORE_KERNELS = kernel.riscv +BSG_MANYCORE_KERNELS = kernel.riscv kernel.rvo: RISCV_CXX = $(RISCV_GXX) kernel.riscv: kernel.rvo +%/kernel.rvo: RISCV_CXX = $(RISCV_GXX) + # Tile Group Dimensions TILE_GROUP_DIM_X = 16 TILE_GROUP_DIM_Y = 8 @@ -108,6 +112,9 @@ RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/inc include $(EXAMPLES_PATH)/cuda/riscv.mk +%/kernel.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo %/kernel.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) + $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ + ############################################################################### # Execution flow # @@ -116,13 +123,37 @@ include $(EXAMPLES_PATH)/cuda/riscv.mk # # SIM_ARGS: Use this to pass arguments to the simulator ############################################################################### -C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) +#C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) +C_ARGS ?= $(KERNEL_NAME) -g $(GRAPH_PATH) SIM_ARGS ?= # Include platform-specific execution rules include $(EXAMPLES_PATH)/execution.mk +HOST_TARGET := $(TEST_NAME).profile + +$(VERSIONS): %: kernel/%/$(HOST_TARGET).log + +ALIASES = vanilla_stats.csv vcache_stats.csv +$(ALIASES): $(HOST_TARGET).log ; +$(HOST_TARGET).log: kernel.riscv $(HOST_TARGET) + ./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@ + + +KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a) +.PRECIOUS: $(KERNEL_ALIASES) +$(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ; +kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv 
$(HOST_TARGET) + $(eval EXEC_PATH := $(patsubst %/,%,$(dir $@))) + $(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH)) + $(eval _VERSION := $(notdir $(EXEC_PATH))) + cd $(EXEC_PATH) && \ + $(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \ + 2>&1 | tee $(notdir $a) + +versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) + ############################################################################### # Regression Flow ############################################################################### @@ -144,8 +175,11 @@ help: print-% : ; @echo $* = $($*) +version.clean: + rm -rf kernel/*/*{.ucli,.csv,.log,.rvo,.riscv,.vpd,.key,.dis,.ll,.ll.s} + .PHONY: clean -clean: +clean: version.clean From bcfc393515a0853f22b8a61921d40e40ba3b0324 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Tue, 20 Apr 2021 10:53:09 -0700 Subject: [PATCH 15/22] cleaning up/adding kernel code to repo --- examples/graphit/test_pr_nibble/Makefile | 34 ++- .../test_pr_nibble/kernel/hybrid/kernel.cpp | 229 ++++++++++++++++++ examples/graphit/test_pr_nibble/main.cpp | 32 +-- examples/graphit/test_pr_nibble/pr_host.hpp | 11 +- 4 files changed, 272 insertions(+), 34 deletions(-) create mode 100644 examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 74d58eeb3..9ca9ec067 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -52,8 +52,29 @@ GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx TEST_NAME = main # KERNEL_NAME is the name of the CUDA-Lite Kernel KERNEL_NAME = pr_nibble +HOST_TARGET := $(TEST_NAME).profile + +BASE_VERSIONS += hybrid-update + +ITERATIONS := 0 1 2 3 4 5 6 7 8 9 +v-from-basev-and-iter = $1-iteration-$2 +basev-from-v = $(word 1,$(subst -iteration-, ,$1)) +iter-from-v = $(word 2,$(subst -iteration-, ,$1)) + +VERSIONS := $(foreach i,$(ITERATIONS),$(foreach 
v,$(BASE_VERSIONS),\ + $(call v-from-basev-and-iter,$v,$i))) + +VERSION-DIRS := $(foreach v,$(VERSIONS),kernel/$v) + +.PHONY: $(VERSION-DIRS) +$(VERSION-DIRS): + cp -r $(call basev-from-v,$@) $@ + +.PHONY: versions bleach-versions +versions: $(VERSION-DIRS) +bleach-versions: + rm -rf $(VERSION-DIRS) -VERSIONS = hybrid DEFAULT_VERSION := hybrid KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp @@ -131,20 +152,19 @@ SIM_ARGS ?= # Include platform-specific execution rules include $(EXAMPLES_PATH)/execution.mk -HOST_TARGET := $(TEST_NAME).profile $(VERSIONS): %: kernel/%/$(HOST_TARGET).log -ALIASES = vanilla_stats.csv vcache_stats.csv +ALIASES = vanilla_stats.csv vcache_stats.csv dramsim3epoch.json dramsim3.json dramsim3.tag.json dramsim3.txt $(ALIASES): $(HOST_TARGET).log ; -$(HOST_TARGET).log: kernel.riscv $(HOST_TARGET) +$(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv ./$(HOST_TARGET) $(SIM_ARGS) +c_args="kernel.riscv $(DEFAULT_VERSION) $(C_ARGS)" 2>&1 | tee $@ KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a) .PRECIOUS: $(KERNEL_ALIASES) $(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ; -kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET) +kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv $(eval EXEC_PATH := $(patsubst %/,%,$(dir $@))) $(eval KERNEL_PATH := $(CURRENT_PATH)/$(EXEC_PATH)) $(eval _VERSION := $(notdir $(EXEC_PATH))) @@ -152,7 +172,9 @@ kernel/%/$(HOST_TARGET).log: kernel/%/kernel.riscv $(HOST_TARGET) $(CURRENT_PATH)/$(HOST_TARGET) $(SIM_ARGS) +c_args="$(KERNEL_PATH)/kernel.riscv $(_VERSION) $(C_ARGS)" \ 2>&1 | tee $(notdir $a) -versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) +.PRECIOUS: %.log + +all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) ############################################################################### # Regression Flow diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp new file mode 
100644 index 000000000..16e66425c --- /dev/null +++ b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp @@ -0,0 +1,229 @@ +//#define DEBUG +#include + +#ifdef DEBUG +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#else +#include +// #define BSG_TILE_GROUP_X_DIM 16 +// #define BSG_TILE_GROUP_Y_DIM 8 +#endif + +#include +bsg_barrier barrier; + +#include +#include + +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) +#endif + +__attribute__((section(".dram"))) float * __restrict p; +__attribute__((section(".dram"))) float * __restrict old_rank; +__attribute__((section(".dram"))) float * __restrict new_rank; +__attribute__((section(".dram"))) int * __restrict out_degree; +__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; +//__attribute__((section(".dram"))) double alpha = 0.15; +//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; + +template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + int * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s]]) { + //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); + apply_func (neighbors[s] , d); + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + 
//if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); + //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); + int start, end; + local_range(V, &start, &end); + for ( int s = start; s < end; s++) { + if(from_vertexset[s]) { + int degree = out_indices[s + 1] - out_indices[s]; + int * neighbors = &out_neighbors[out_indices[s]]; + for(int d = 0; d < degree; d++) { + apply_func (s, neighbors[d]); + //if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); } + + } + } //end of loop on in neighbors + } //end of outer for loop + //barrier.sync(); + return 0; +} //end of edgeset apply function + + +struct generated_vector_op_apply_func_4 +{ + void operator() (int v) + { + out_degree[v] = generated_tmp_vector_3[v]; + }; +}; +struct new_rank_generated_vector_op_apply_func_2 +{ + void operator() (int v) + { + new_rank[v] = ((float) 0) ; + }; +}; +struct old_rank_generated_vector_op_apply_func_1 +{ + void operator() (int v) + { + old_rank[v] = ((float) 0) ; + }; +}; +struct p_generated_vector_op_apply_func_0 +{ + void operator() (int v) + { + p[v] = ((float) 0) ; + }; +}; +struct updateEdge +{ + void operator() (int src, int dst) + { + float alpha = 0.15; + new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); + }; +}; +struct updateSelf +{ + void operator() (int v) + { + float alpha = 0.15; + p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); + new_rank[v] = (0) ; + }; +}; +struct filter_frontier +{ + bool operator() (int v) + { + float epsilon = (float) 1e-6; + bool output ; + //if(old_rank[v] == 0) return 0; + if(new_rank[v] == 0) return 0; + //output = (old_rank[v]) > ((out_degree[v] * epsilon)); + output = (new_rank[v]) > ((out_degree[v] * epsilon)); + return output; + }; +}; + +extern "C" int __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < 
end; iter_x++) { + p_generated_vector_op_apply_func_0()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + old_rank_generated_vector_op_apply_func_1()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + new_rank_generated_vector_op_apply_func_2()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) { + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + generated_vector_op_apply_func_4()(iter_x); + } + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if(frontier[iter_x]) { updateSelf()(iter_x); } + } + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int 
*frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { + //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c); + //pr_dbg("%i: on frontier filter %i\n", bsg_id, tag_c); + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if (iter_x < V) { + next5[iter_x] = 0; + if ( filter_frontier()( iter_x ) ) { + next5[iter_x] = 1; + //pr_dbg("added vertex %i to frontier\n", iter_x); + } + } + else { break; } + } //end of loop + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index ff396f302..9cbfdde2b 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -34,11 +34,11 @@ int launch(int argc, char ** argv){ std::string ucode_path = input.getRISCVFile(); int iter = 0; - // std::string iterstrbase = "iteration-"; - // auto pos = ucode_path.find(iterstrbase); - // auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); - // std::stringstream ss(iterstr); - // ss >> iter; + std::string iterstrbase = "iteration-"; + auto pos = ucode_path.find(iterstrbase); + auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); + std::stringstream ss(iterstr); + ss >> iter; std::cerr << "iteration: " << iter << std::endl; int version = 0; //default to vertex pull @@ -84,7 +84,6 @@ int launch(int argc, char ** argv){ float epsilon = ((float) 1e-06) ; int start_vertex = ROOT; Vector 
frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); - //Vector next_frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); std::vector hfrontier(edges.num_nodes(), 0); std::vector p(edges.num_nodes(), (float) 0.0); @@ -122,12 +121,7 @@ int launch(int argc, char ** argv){ std::cerr << "start of while loop\n"; int tag_c = 0; - //double host_rank[edges.num_nodes()]; - //ofstream prog_file; - //prog_file.open("./progress.txt"); - //prog_file << "starting computation w/ root vertex: " << start_vertex << std::endl; //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) - //while ( iter < 16) for(int i = 0; i < 1; i++) { int f_sz = 0; @@ -136,10 +130,10 @@ int launch(int argc, char ** argv){ case 0: //vertex pull std::cerr << "pull kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; - std::cerr << "run update edges kernel on iter : " << iter << "\n"; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; @@ -154,25 +148,25 @@ int launch(int argc, char ** argv){ case 1: //vertex push std::cerr << "push kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", 
hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; - std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); std::cerr << "create next frontier\n"; device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; case 2: //blocked pull std::cerr << "blocked pull kernel\n"; std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {edges.num_nodes(), tag_c}); + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; @@ -191,13 +185,9 @@ int launch(int argc, char ** argv){ tag_c++; iter++; - //prog_file << "finished iteration: " << iter << std::endl; } std::cerr << "*******end of program********\n"; - //prog_file << "*******end of program********\n"; std::cerr << "took: " << iter << " iterations to complete\n"; - //prog_file << "took: " << iter << " iterations to complete\n"; - //prog_file.close(); if(VERIFY) { ofstream ver_file; ver_file.open("./rank.txt"); diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp index 7e7479495..1923c6d6d 100644 --- a/examples/graphit/test_pr_nibble/pr_host.hpp +++ b/examples/graphit/test_pr_nibble/pr_host.hpp @@ -13,16 +13,16 @@ inline void host_pr_calc(std::vector & p, std::vector & old_rank, ofstream ofile; ofile.open (fname); for(int i = 0; i < iter; i++) { - 
//std::memcpy(new_rank, old_rank, sizeof(float)*edges.num_nodes()); - //new_rank = old_rank; new_rank.assign(old_rank.begin(), old_rank.end()); //print out iteration and size: int num_items = std::count(frontier.begin(), frontier.end(), 1); std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; //update_self for(int v = 0; v < g.num_nodes(); v++) { - p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; - new_rank[v] = (float) 0.0 ; + if(frontier[v]) { + p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; + new_rank[v] = (float) 0.0 ; + } } //update edges for(int d = 0; d < g.num_nodes(); d++) { @@ -35,9 +35,6 @@ inline void host_pr_calc(std::vector & p, std::vector & old_rank, } } } - //old_rank.swap(new_rank); - //std::memcpy(old_rank, new_rank, sizeof(float)*edges.num_nodes()); - //old_rank = new_rank; old_rank.assign(new_rank.begin(), new_rank.end()); //update frontier for(int v = 0; v < g.num_nodes(); v++) { From 5a72371e4b338b6db74842daa495f7506bdde6a8 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Wed, 5 May 2021 10:57:33 -0700 Subject: [PATCH 16/22] Redirects submodules --- .gitmodules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 099c758cb..c130faae3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,7 +3,7 @@ url = git@github.com:bespoke-silicon-group/hb-prog-eval [submodule "examples/sdh-eval-workloads/ipnsw/graph-tools"] path = examples/sdh-eval-workloads/ipnsw/graph-tools - url = git@github.com:mrutt92/graph-tools + url = git@github.com:bespoke-silicon-group/graph-tools [submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"] path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers - url = git@github.com:mrutt92/hammerblade-helpers + url = git@github.com:bespoke-silicon-group/hammerblade-helpers From b12e42e7842cb0168b789e9badaaae20fbe70e54 Mon Sep 17 00:00:00 2001 From: Max Ruttenberg Date: Wed, 5 May 2021 12:20:10 -0700 Subject: [PATCH 17/22] 
[ipnsw] adds rule to generate input --- examples/sdh-eval-workloads/ipnsw/Makefile | 30 +++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/examples/sdh-eval-workloads/ipnsw/Makefile b/examples/sdh-eval-workloads/ipnsw/Makefile index d643c9e9f..a8f6da2c5 100644 --- a/examples/sdh-eval-workloads/ipnsw/Makefile +++ b/examples/sdh-eval-workloads/ipnsw/Makefile @@ -6,6 +6,23 @@ include $(REPLICANT_PATH)/environment.mk all: +################## +# Prepare inputs # +################## +ipnsw-eval-dir := $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw +ipnsw-inputs = $(ipnsw-eval-dir)/data/database_music100.bin +ipnsw-inputs += $(ipnsw-eval-dir)/data/query_music100.bin +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_0 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_1 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_2 +ipnsw-inputs += $(ipnsw-eval-dir)/data/music.edges.level_3 + +ipnsw-input := $(ipnsw-eval-dir)/data/database_music100.bin +# this rule generates all the inputs, but we just target one +# to avoid running this more than once +$(ipnsw-input): + cd $(ipnsw-eval-dir) && bash prep.sh + ####################################### # Base clase run directory generation # ####################################### @@ -50,12 +67,7 @@ endef ################################# # Common command line arguments # ################################# -C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/database_music100.bin -C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/query_music100.bin -C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_0 -C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_1 -C_ARGS += $(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_2 -C_ARGS += 
$(EXAMPLES_PATH)/sdh-eval-workloads/ipnsw/hb-prog-eval/ipnsw/data/music.edges.level_3 +C_ARGS += $(ipnsw-inputs) ############### # Greedy Walk # @@ -141,6 +153,12 @@ exec: exec-$(call run-name,$1,$2) profile: profile-$(call run-name,$1,$2) debug: debug-$(call run-name,$1,$2) saifgen: saifgen-$(call run-name,$1,$2) + +saifgen-$(call run-name,$1,$2): $(ipnsw-input) +profile-$(call run-name,$1,$2): $(ipnsw-input) +debug-$(call run-name,$1,$2): $(ipnsw-input) +exec-$(call run-name,$1,$2): $(ipnsw-input) + endef .PHONY: generate .PHONY: purge From 769350dc4d4e49904aed4b570ec8d865e4d525eb Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Wed, 5 May 2021 16:24:36 -0700 Subject: [PATCH 18/22] [pr-nibble] adding graphit submodule, removing unnecessary tests, trying to refactor test --- .gitmodules | 3 + examples/graphit/Makefile | 1 - examples/graphit/graphit-src | 1 + examples/graphit/riscv.mk | 257 ------------------ examples/graphit/test_pr_nibble/Makefile | 9 +- examples/graphit/test_pr_nibble/main.cpp | 18 +- examples/graphit/test_pr_nibble/pr.hpp | 2 +- examples/graphit/test_sssp_delta/Makefile | 151 ---------- examples/graphit/test_sssp_delta/kernel.cpp | 99 ------- .../kernel/include/pr_nibble.hpp | 9 - .../test_sssp_delta/kernel/include/sssp.hpp | 8 - examples/graphit/test_sssp_delta/main.cpp | 219 --------------- examples/graphit/test_sssp_delta/sssp.hpp | 26 -- .../graphit/test_vec_add_parallel/Makefile | 142 ---------- .../graphit/test_vec_add_parallel/kernel.cpp | 20 -- examples/graphit/test_vec_add_parallel/main.c | 196 ------------- 16 files changed, 11 insertions(+), 1150 deletions(-) create mode 160000 examples/graphit/graphit-src delete mode 100644 examples/graphit/riscv.mk delete mode 100644 examples/graphit/test_sssp_delta/Makefile delete mode 100644 examples/graphit/test_sssp_delta/kernel.cpp delete mode 100644 examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp delete mode 100644 examples/graphit/test_sssp_delta/kernel/include/sssp.hpp delete 
mode 100644 examples/graphit/test_sssp_delta/main.cpp delete mode 100644 examples/graphit/test_sssp_delta/sssp.hpp delete mode 100644 examples/graphit/test_vec_add_parallel/Makefile delete mode 100644 examples/graphit/test_vec_add_parallel/kernel.cpp delete mode 100644 examples/graphit/test_vec_add_parallel/main.c diff --git a/.gitmodules b/.gitmodules index c130faae3..18ee1bd0f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "examples/sdh-eval-workloads/ipnsw/hammerblade-helpers"] path = examples/sdh-eval-workloads/ipnsw/hammerblade-helpers url = git@github.com:bespoke-silicon-group/hammerblade-helpers +[submodule "examples/graphit/graphit-src"] + path = examples/graphit/graphit-src + url = https://github.com/bespoke-silicon-group/graphit.git diff --git a/examples/graphit/Makefile b/examples/graphit/Makefile index f8389272b..600ef53f4 100644 --- a/examples/graphit/Makefile +++ b/examples/graphit/Makefile @@ -45,7 +45,6 @@ include $(REPLICANT_PATH)/environment.mk include $(EXAMPLES_PATH)/link.mk # Define the tests that get run -TESTS += test_vec_add_parallel TESTS += test_pr_nibble regression: $(TESTS) diff --git a/examples/graphit/graphit-src b/examples/graphit/graphit-src new file mode 160000 index 000000000..9f4d8e9ba --- /dev/null +++ b/examples/graphit/graphit-src @@ -0,0 +1 @@ +Subproject commit 9f4d8e9bacac0ed44afe7c3abde697f21457a487 diff --git a/examples/graphit/riscv.mk b/examples/graphit/riscv.mk deleted file mode 100644 index 87a52d511..000000000 --- a/examples/graphit/riscv.mk +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2019, University of Washington All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, this list -# of conditions and the following disclaimer. 
-# -# Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or -# other materials provided with the distribution. -# -# Neither the name of the copyright holder nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# TODO: Makefile comment -ORANGE=\033[0;33m -RED=\033[0;31m -NC=\033[0m - -################################################################################ -# Paths -################################################################################ -_REPO_ROOT ?= $(shell git rev-parse --show-toplevel) --include $(_REPO_ROOT)/environment.mk - -BSG_MANYCORE_SPMD_PATH = $(BSG_MANYCORE_DIR)/software/spmd/ -BSG_MANYCORE_CUDALITE_PATH = $(BSG_MANYCORE_SPMD_PATH)/bsg_cuda_lite_runtime/ -BSG_MANYCORE_CUDALITE_MAIN_PATH = $(BSG_MANYCORE_CUDALITE_PATH)/main - -BSG_MANYCORE_LIB_PATH = $(BSG_MANYCORE_DIR)/software/bsg_manycore_lib -BSG_MANYCORE_COMMON_PATH = $(BSG_MANYCORE_SPMD_PATH)/common/ - -RISCV_TOOLS_PATH := $(BSG_MANYCORE_DIR)/software/riscv-tools/ -RISCV_GNU_PATH := $(RISCV_TOOLS_PATH)/riscv-install -RISCV_LLVM_PATH := $(RISCV_TOOLS_PATH)/llvm/llvm-install - -################################################################################ -# Include RISC-V Tool Configuration -################################################################################ - -RISCV_LINK_GEN := $(BSG_MANYCORE_DIR)/software/py/bsg_manycore_link_gen.py - -# These flags are not supported by clang -RISCV_GNU_FLAGS = -frerun-cse-after-loop -fweb -frename-registers -mtune=bsg_vanilla_2020 - -RISCV_GCC ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-gcc $(RISCV_GNU_FLAGS) -RISCV_GXX ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-g++ $(RISCV_GNU_FLAGS) -RISCV_ELF2HEX ?= LD_LIBRARY_PATH=$(RISCV_GNU_PATH)/lib $(RISCV_GNU_PATH)/bin/elf2hex -RISCV_OBJCOPY ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-objcopy -RISCV_AR ?= $(RISCV_GNU_PATH)/bin/riscv32-unknown-elf-dramfs-ar -RISCV_OBJDUMP ?= $(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs-objdump -RISCV_LINK ?= $(RISCV_GCC) -t -T $(LINK_SCRIPT) $(RISCV_LDFLAGS) -RISCV_LD ?= $(RISCV_GCC) - -RISCV_CLANG_ABI = ilp32f -RISCV_CLANG_CCPPFLAGS += --target=riscv32 -mabi=$(RISCV_CLANG_ABI) -march=riscv32imaf -mtune=hb-rv32 -RISCV_CLANG_CXXFLAGS += 
--sysroot=$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs -RISCV_CLANG_CXXFLAGS += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0 -RISCV_CLANG_CXXFLAGS += -I$(RISCV_GNU_PATH)/riscv32-unknown-elf-dramfs/include/c++/9.2.0/riscv32-unknown-elf-dramfs - -RISCV_CLANG ?= $(RISCV_LLVM_PATH)/bin/clang $(RISCV_CLANG_CFLAGS) $(RISCV_CLANG_CCPPFLAGS) -RISCV_CLANGXX ?= $(RISCV_LLVM_PATH)/bin/clang++ $(RISCV_CLANG_CXXFLAGS) $(RISCV_CLANG_CCPPFLAGS) -RISCV_LLVM_OPT ?= $(RISCV_LLVM_PATH)/bin/opt -RISCV_LLVM_LLC ?= $(RISCV_LLVM_PATH)/bin/llc -RISCV_LLVM_LIB ?= $(RISCV_LLVM_PATH)/lib - -# Set the default RISC-V Compilers. To override these globally set -# RISCV_CXX = $(RISCV_CLANGXX), etc. This can also be done on a -# per-object basis. For example, foo.rvo: RISCV_CXX=$(RISCV_CLANGXX) -RISCV_CXX ?= $(RISCV_GXX) -RISCV_CC ?= $(RISCV_GCC) - -################################################################################ -# C/C++ Compilation Flags -# -# All RISCV C/C++ compilation variables simply have RISCV_* appended. 
-################################################################################ -RISCV_OPT_LEVEL ?= -O2 -RISCV_ARCH_OP := rv32imaf - -# CCPPFLAGS are common between GCC and G++ -RISCV_CCPPFLAGS += $(RISCV_OPT_LEVEL) -RISCV_CCPPFLAGS += -march=$(RISCV_ARCH_OP) -RISCV_CCPPFLAGS += -g -RISCV_CCPPFLAGS += -static -RISCV_CCPPFLAGS += -ffast-math -RISCV_CCPPFLAGS += -fno-common -RISCV_CCPPFLAGS += -ffp-contract=off - -RISCV_CFLAGS += -std=gnu99 $(RISCV_CCPPFLAGS) -RISCV_CXXFLAGS += -std=c++11 $(RISCV_CCPPFLAGS) -RISCV_CXXFLAGS += -fno-threadsafe-statics - -RISCV_INCLUDES += -I$(BSG_MANYCORE_COMMON_PATH) -RISCV_INCLUDES += -I$(BSG_MANYCORE_DIR)/software/bsg_manycore_lib - -# TODO: Fail if bsg_tiles_X/Y are not set -RISCV_DEFINES += -Dbsg_global_X=$(BSG_MACHINE_GLOBAL_X) -RISCV_DEFINES += -Dbsg_global_Y=$(BSG_MACHINE_GLOBAL_Y) -RISCV_DEFINES += -Dbsg_group_size=$(BSG_MACHINE_POD_TILES) -RISCV_DEFINES += -Dbsg_pods_X=$(BSG_MACHINE_PODS_X) -RISCV_DEFINES += -Dbsg_pods_Y=$(BSG_MACHINE_PODS_Y) -RISCV_DEFINES += -DIO_X_INDEX=$(BSG_MACHINE_HOST_X_CORD) -RISCV_DEFINES += -DIO_Y_INDEX=$(BSG_MACHINE_HOST_Y_CORD) -RISCV_DEFINES += -DPREALLOCATE=0 -RISCV_DEFINES += -DHOST_DEBUG=0 - -# We build and name a machine-specific crt.rvo because it's REALLY -# difficult to figure out why your program/cosimulation is hanging -# when the wrong link script was used during linking -crt.rvo: $(BSG_MANYCORE_COMMON_PATH)/crt.S - $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.comp.log - -# We compile these locally so that we don't interfere with the files in -# $(BSG_MANYCORE_LIB_PATH). 
-# BSG Manycore Library Objects -LIBBSG_MANYCORE_OBJECTS += bsg_set_tile_x_y.rvo -LIBBSG_MANYCORE_OBJECTS += bsg_tile_config_vars.rvo -LIBBSG_MANYCORE_OBJECTS += bsg_printf.rvo - -$(LIBBSG_MANYCORE_OBJECTS) main.rvo: RISCV_CXX = $(RISCV_GCC) - -$(LIBBSG_MANYCORE_OBJECTS): %.rvo:$(BSG_MANYCORE_LIB_PATH)/%.c - $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ - -main.rvo: $(BSG_MANYCORE_CUDALITE_MAIN_PATH)/main.c - $(RISCV_GCC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ - -%.rvo: %.c - $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log - -%.rvo: %.cpp - $(RISCV_CXX) $(RISCV_CXXFLAGS) $(RISCV_DEFINES) $(RISCV_INCLUDES) -c $< -o $@ |& tee $*.gcc.log - -kernel.compile.clean: - rm -rf *.rvo *.a - -.PRECIOUS: %.rvo - -################################################################################ -# Linker Flow -################################################################################ - -# ELF File Parameters -# Default .data section location; LOCAL=>DMEM, SHARED=>DRAM. 
-BSG_ELF_DEFAULT_DATA_LOC ?= LOCAL - -BSG_ELF_OFF_CHIP_MEM := $(BSG_MACHINE_DRAM_INCLUDED) - -# Total addressable DRAM size (in 32-bit WORDS, and SIZE bytes) -BSG_ELF_DRAM_WORDS := $(shell expr $(BSG_MACHINE_DRAM_BANK_SIZE_WORDS) \* $(BSG_MACHINE_GLOBAL_X)) -BSG_ELF_DRAM_SIZE := $(shell expr $(BSG_ELF_DRAM_WORDS) \* 4) - -# Victim Cache Set Size (in 32-bit WORDS and SIZE bytes) -_BSG_ELF_VCACHE_SET_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_WAY) \* $(BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS)) -BSG_ELF_VCACHE_SET_SIZE := $(shell expr $(_BSG_ELF_VCACHE_SET_WORDS) \* 4) - -# Victim Cache Column Size (in 32-bit WORDS and SIZE bytes) -_BSG_ELF_VCACHE_COLUMN_WORDS := $(shell expr $(BSG_MACHINE_VCACHE_SET) \* $(_BSG_ELF_VCACHE_SET_WORDS)) -BSG_ELF_VCACHE_COLUMN_SIZE := $(shell expr $(_BSG_ELF_VCACHE_COLUMN_WORDS) \* 4) - -# Victim Cache Total Size (in 32-bit WORDS, and SIZE BYTES) -_BSG_ELF_VCACHE_MANYCORE_WORDS ?= $(shell expr $(BSG_MACHINE_GLOBAL_X) \* $(_BSG_ELF_VCACHE_COLUMN_WORDS)) -BSG_ELF_VCACHE_MANYCORE_SIZE := $(shell expr $(_BSG_ELF_VCACHE_MANYCORE_WORDS) \* 4) - -# Compute the ELF Stack Pointer Location. -ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) -# If the .data segment is in DMEM (LOCAL) then put it at the top of DMEM. (This is the typical case) -BSG_ELF_STACK_PTR ?= 0x00000ffc -else - # EVA Offset in DRAM - BSG_ELF_DRAM_EVA_OFFSET = 0x80000000 - - ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1) - # Otherwise, use the top of DRAM (if present), - _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_DRAM_SIZE)) - else - # Or the Victim Cache address space (if DRAM is disabled/not present). 
- _BSG_ELF_DRAM_LIMIT = $(shell expr $(BSG_ELF_DRAM_EVA_OFFSET) + $(BSG_ELF_VCACHE_MANYCORE_SIZE)) - endif -# Finally, Subtract 4 from the maximum memory space address -BSG_ELF_STACK_PTR = $(shell expr $(_BSG_ELF_DRAM_LIMIT) - 4) -endif - -# Linker script generation parameters -ifeq ($(BSG_ELF_OFF_CHIP_MEM), 1) - ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) - LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR) - else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED) - LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_DRAM_SIZE) --sp=$(BSG_ELF_STACK_PTR) - else - $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid) - endif - - LINK_GEN_OPTS += --imem_size=0x01000000 # 16MB -else ifeq ($(BSG_ELF_OFF_CHIP_MEM), 0) - ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), LOCAL) - LINK_GEN_OPTS ?= --default_data_loc=dmem --dram_size=$(BSG_ELF_VCACHE_SIZE) --sp=$(BSG_ELF_STACK_PTR) - else ifeq ($(BSG_ELF_DEFAULT_DATA_LOC), SHARED) - LINK_GEN_OPTS ?= --default_data_loc=dram --dram_size=$(BSG_ELF_VCACHE_SIZE) --sp=$(BSG_ELF_STACK_PTR) - else - $(error Invalid BSG_ELF_DEFAULT_DATA_LOC = $(BSG_ELF_DEFAULT_DATA_LOC); Only LOCAL and SHARED are valid) - endif - - LINK_GEN_OPTS += --imem_size=0x00008000 # 32KB -else - $(error Invalid BSG_ELF_OFF_CHIP_MEM = $(BSG_ELF_OFF_CHIP_MEM); Only 0 and 1 are valid) -endif - -RISCV_LINK_SCRIPT ?= bsg_link.ld -$(RISCV_LINK_SCRIPT): $(RISCV_LINK_GEN) - $(RISCV_LINK_GEN) $(LINK_GEN_OPTS) --out=$@ - -# Link commands and definitions - -RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_dram_size=$(BSG_ELF_DRAM_SIZE) -RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_vcache_size=$(BSG_ELF_VCACHE_MANYCORE_SIZE) -RISCV_LDFLAGS += -Wl,--defsym,_bsg_elf_stack_ptr=$(BSG_ELF_STACK_PTR) - -RISCV_LDFLAGS += -nostdlib -RISCV_LDFLAGS += -march=$(RISCV_ARCH_OP) -RISCV_LDFLAGS += -nostartfiles -RISCV_LDFLAGS += -ffast-math -RISCV_LDFLAGS += -lc -RISCV_LDFLAGS += -lm -RISCV_LDFLAGS += -lgcc - -# 
TODO: temporary fix to solve this problem: https://stackoverflow.com/questions/56518056/risc-v-linker-throwing-sections-lma-overlap-error-despite-lmas-belonging-to-dif -RISCV_LDFLAGS += -Wl,--no-check-sections - -# This builds a .riscv binary for the current machine type and tile -# group size. RISCV_TARGET_OBJECTS are .rvo files that will be linked -# in the final binary. -%.riscv: crt.rvo bsg_set_tile_x_y.rvo bsg_tile_config_vars.rvo main.rvo $(RISCV_TARGET_OBJECTS) $(RISCV_LINK_SCRIPT) - $(RISCV_LD) -T $(RISCV_LINK_SCRIPT) $(RISCV_LDFLAGS) $(filter %.rvo,$^) -o $@ - -kernel.link.clean: - rm -rf *.riscv $(RISCV_LINK_SCRIPT) - - -.PRECIOUS: %.riscv -.PHONY: kernel.link.clean kernel.compile.clean -clean: kernel.link.clean kernel.compile.clean - diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index 9ca9ec067..bdd77f513 100644 --- a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -45,7 +45,7 @@ REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) include $(REPLICANT_PATH)/environment.mk SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime -GRAPHIT_PATH = $(REPLICANT_PATH)/../graphit-new +GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-src GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx # TEST_NAME is the basename of the executable @@ -54,7 +54,7 @@ TEST_NAME = main KERNEL_NAME = pr_nibble HOST_TARGET := $(TEST_NAME).profile -BASE_VERSIONS += hybrid-update +BASE_VERSIONS += hybrid ITERATIONS := 0 1 2 3 4 5 6 7 8 9 v-from-basev-and-iter = $1-iteration-$2 @@ -98,8 +98,7 @@ CXXFLAGS += -std=c++11 $(FLAGS) include $(EXAMPLES_PATH)/compilation.mk # Specify any header file dependencies -main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ -main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h +main.o: INCLUDES += -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ 
############################################################################### # Host code link flags and flow @@ -162,7 +161,7 @@ $(HOST_TARGET).log: $(HOST_TARGET) kernel.riscv KERNEL_ALIASES = $(foreach a,$(ALIASES),kernel/%/$a) -.PRECIOUS: $(KERNEL_ALIASES) +.PRECIOUS: $(KERNEL_ALIASES) kernel/%/kernel.riscv $(KERNEL_ALIASES): kernel/%/$(HOST_TARGET).log ; kernel/%/$(HOST_TARGET).log: $(HOST_TARGET) kernel/%/kernel.riscv $(eval EXEC_PATH := $(patsubst %/,%,$(dir $@))) diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index 9cbfdde2b..f0bee8b64 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -25,7 +25,7 @@ GlobalScalar out_degree_dev; #include "pr_host.hpp" -int launch(int argc, char ** argv){ +int test_pr_nibble(int argc, char ** argv){ InputParser input(argc, argv); if(!input.cmdOptionExists("-g")){ std::cerr << "no input args\n"; @@ -202,18 +202,4 @@ int launch(int argc, char ** argv){ return 0; } -#ifdef VCS -int vcs_main(int argc, char ** argv){ - bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n"); - int rc = launch(argc, argv); - bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); - return rc; -} -#else -int main(int argc, char ** argv) { - bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n"); - int rc = launch(argc, argv); - bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); - return rc; -} -#endif +declare_program_main("test_pr_nibble", test_pr_nibble); diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/test_pr_nibble/pr.hpp index b1f9ac484..5cce0e30a 100644 --- a/examples/graphit/test_pr_nibble/pr.hpp +++ b/examples/graphit/test_pr_nibble/pr.hpp @@ -4,6 +4,7 @@ #include "hb_intrinsics.h" #include "infra_hb/host/arg_parser.hpp" +#include #include #include #include @@ -14,7 +15,6 @@ #include #include #include -#include using hammerblade::Device; diff --git a/examples/graphit/test_sssp_delta/Makefile 
b/examples/graphit/test_sssp_delta/Makefile deleted file mode 100644 index 2e980c4e2..000000000 --- a/examples/graphit/test_sssp_delta/Makefile +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2021, University of Washington All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, this list -# of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or -# other materials provided with the distribution. -# -# Neither the name of the copyright holder nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# This Makefile compiles, links, and executes examples Run `make help` -# to see the available targets for the selected platform. 
- -################################################################################ -# environment.mk verifies the build environment and sets the following -# makefile variables: -# -# LIBRAIRES_PATH: The path to the libraries directory -# HARDWARE_PATH: The path to the hardware directory -# EXAMPLES_PATH: The path to the examples directory -# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL -# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore -############################################################################### - -REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) - -include $(REPLICANT_PATH)/environment.mk -SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd -CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime -CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -GRAPHIT_PATH = $(CURRENT_PATH)/../graphit-new - -GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx - -# TEST_NAME is the basename of the executable -TEST_NAME = main -# KERNEL_NAME is the name of the CUDA-Lite Kernel -KERNEL_NAME = sssp - -############################################################################### -# Host code compilation flags and flow -############################################################################### - -# TEST_SOURCES is a list of source files that need to be compiled -TEST_SOURCES = main.cpp - -DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -CDEFINES += -CXXDEFINES += - -FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable -CFLAGS += -std=c99 $(FLAGS) -CXXFLAGS += -std=c++11 $(FLAGS) - -HOST_CXX = /mnt/users/ssd0/homes/eafurst/research/gcc-build/bin/g++ - -# compilation.mk defines rules for compilation of C/C++ -include $(EXAMPLES_PATH)/compilation.mk - -# Specify any header file dependencies -main.o: INCLUDES += -I$(EXAMPLES_PATH) -I$(CURRENT_PATH) -I$(GRAPHIT_PATH)/src/runtime_lib/ -main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h - -############################################################################### -# 
Host code link flags and flow -############################################################################### - -LDFLAGS += - -# link.mk defines rules for linking of the final execution binary. -include $(EXAMPLES_PATH)/link.mk - -############################################################################### -# Device code compilation flow -############################################################################### - -# BSG_MANYCORE_KERNELS is a list of manycore executables that should -# be built before executing. -BSG_MANYCORE_KERNELS = kernel.riscv - -kernel.rvo: RISCV_CXX = $(RISCV_GXX) -kernel.riscv: kernel.rvo - -# Tile Group Dimensions -TILE_GROUP_DIM_X = 16 -TILE_GROUP_DIM_Y = 8 -RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) -RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) - -RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_sssp_delta/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ - -include $(EXAMPLES_PATH)/cuda/riscv.mk - -############################################################################### -# Execution flow -# -# C_ARGS: Use this to pass arguments that you want to appear in argv -# For SPMD tests C arguments are: -# -# SIM_ARGS: Use this to pass arguments to the simulator -############################################################################### -C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) -g $(GRAPH_PATH) - -SIM_ARGS ?= - -# Include platform-specific execution rules -include $(EXAMPLES_PATH)/execution.mk - -############################################################################### -# Regression Flow -############################################################################### - -regression: main.exec.log - @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null - -############################################################################### -# Default rules, help, and clean -############################################################################### -.DEFAULT_GOAL := help 
-help: - @echo "Usage:" - @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" - @echo " $(TEST_NAME).profile: Build executable with profilers enabled" - @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" - @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" - @echo " clean: Remove all subdirectory-specific outputs" - -print-% : ; @echo $* = $($*) - -.PHONY: clean - -clean: - - diff --git a/examples/graphit/test_sssp_delta/kernel.cpp b/examples/graphit/test_sssp_delta/kernel.cpp deleted file mode 100644 index 5039f1dd2..000000000 --- a/examples/graphit/test_sssp_delta/kernel.cpp +++ /dev/null @@ -1,99 +0,0 @@ -#include -#include - -//#define BSG_TILE_GROUP_X_DIM 16 -//#define BSG_TILE_GROUP_Y_DIM 2 -//#define bsg_tiles_X BSG_TILE_GROUP_X_DIM -//#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM -#include -bsg_barrier barrier; -#include - -//#define DEBUG -#ifdef DEBUG -#define pr_dbg(fmt, ...) \ - bsg_printf(fmt, ##__VA_ARGS__) -#else -#define pr_dbg(fmt, ...) 
-#endif - - -__attribute__((section(".dram"))) int * __restrict dist; - -template int edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(int *in_indices , WNode *in_neighbors, int* from_vertexset, int * next_frontier, APPLY_FUNC apply_func, int V, int E, int block_size_x) -{ - bsg_cuda_print_stat_start(1); - bsg_saif_start(); - int start, end; - local_range(V, &start, &end); - if(bsg_id == 0) pr_dbg("elem 1: %i and dist: %i and random weight: %i\n", from_vertexset[5], dist[5], in_neighbors[in_indices[5]].weight); - for ( int d = start; d < end; d++) { - int degree = in_indices[d + 1] - in_indices[d]; - WNode * neighbors = &in_neighbors[in_indices[d]]; - for(int s = 0; s < degree; s++) { - if(from_vertexset[neighbors[s].vertex]) { - if( apply_func ( neighbors[s].vertex, d, neighbors[s].weight )) { - next_frontier[d] = 1; - } - } - } //end of loop on in neighbors - } //end of outer for loop - bsg_saif_end(); - bsg_cuda_print_stat_end(1); - barrier.sync(); - return 0; -} //end of edgeset apply function - - -struct dist_generated_vector_op_apply_func_0 -{ - void operator() (int v) - { - dist[v] = (2147483647) ; - }; -}; -struct updateEdge -{ - bool operator() (int src, int dst, int weight) - { - bool output3 = false; - int new_dist = (dist[src] + weight); - if(dist[dst] > new_dist) { - dist[dst] = new_dist; - output3 = true; - } - return output3; - }; -}; -struct reset -{ - void operator() (int v) - { - dist[v] = (2147483647) ; - }; -}; - -extern "C" int __attribute__ ((noinline)) dist_generated_vector_op_apply_func_0_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - dist_generated_vector_op_apply_func_0()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) reset_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - reset()(iter_x); - } - barrier.sync(); - return 0; -} 
-extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call(int *in_indices, WNode *in_neighbors, int *frontier, int *modified_vertexsubset1, int V, int E, int block_size_x) { - edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier(in_indices, in_neighbors, frontier, modified_vertexsubset1, updateEdge(), V, E, block_size_x); - return 0; -} - - diff --git a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp b/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp deleted file mode 100644 index ee50a54d6..000000000 --- a/examples/graphit/test_sssp_delta/kernel/include/pr_nibble.hpp +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once -#ifndef __PR_PULL_BENCHMARK_HPP -#define __PR_PULL_BENCHMARK_HPP - -#include -#include -#include -#include -#endif diff --git a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp b/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp deleted file mode 100644 index 23da05000..000000000 --- a/examples/graphit/test_sssp_delta/kernel/include/sssp.hpp +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __SSSP_BENCHMARK_HPP -#define __SSSP_BENCHMARK_HPP - -#include -#include -#include -#include -#endif diff --git a/examples/graphit/test_sssp_delta/main.cpp b/examples/graphit/test_sssp_delta/main.cpp deleted file mode 100644 index 069ee5426..000000000 --- a/examples/graphit/test_sssp_delta/main.cpp +++ /dev/null @@ -1,219 +0,0 @@ -#include "sssp.hpp" -#define X 16 -#define Y 2 -#define NUM_LOCKS 1024 -#define VERIFY false -#define ROOT 6 -#define DELTA 32 - -WGraphHB edges; -GlobalScalar dist_dev; -//BucketPriorityQueue pq; - -bool apply(int s, int d, int w, std::vector &dist) { - int new_dist = (dist[s] + w); - if(dist[d] > new_dist) { - dist[d] = new_dist; - return true; - } - return false; -} - -void sssp_pull_call(std::vector &front, std::vector &next, std::vector &dist) { - auto g = edges.getHostGraph(); - auto * in_neigh = 
g.in_neighbors_shared_.get(); - auto ** in_index = g.in_index_shared_.get(); - for(int d = 0; d < edges.num_nodes(); d++) { - int ind = in_index[d] - in_neigh; - int degree = g.in_degree(d); - auto * neighbors = &in_neigh[ind]; - for(int s = 0; s < degree; s++){ - if(front[neighbors[s].v]){ - if(apply(neighbors[s].v, d, neighbors[s].w, dist)) { - next[d] = 1; - } - } - } - } - -} - -void host_sssp_pull(BucketPriorityQueue& pq, std::vector &dist, int iter) { - dist[ROOT] = 0; - Device::Ptr device = Device::GetInstance(); - Vector next_frontier_dev = Vector(edges.num_nodes()); - std::vector h_next(edges.num_nodes(), 0); - std::vector h_front(edges.num_nodes(), 0); - - for(int i = 0; i < iter; i++) { - if(!(pq.finished() == 0)) { std::cout << "no more items on iter: " << i << "\n"; break; } - Vector front = pq.popDenseReadyVertexSet(); - front.copyToHost(h_front.data(), edges.num_nodes()); - device->freeze_cores(); - device->read_dma(); - device->unfreeze_cores(); - int num_elems = std::count(h_front.begin(), h_front.end(), 1); - std::cout << "num elems in front: " << num_elems << " val of 0: " << h_front[0] << std::endl; - sssp_pull_call(h_front, h_next, dist); - num_elems = std::count(h_next.begin(), h_next.end(), 1); - std::cout << "num elems in next front: " << num_elems << std::endl; - std::cout << "dist of 1: " << dist[1] << std::endl; - next_frontier_dev.copyToDevice(h_next.data(), edges.num_nodes()); - hammerblade::write_global_buffer_dma(dist.data(), dist_dev, edges.num_nodes()); - device->freeze_cores(); - device->write_dma(); - device->unfreeze_cores(); - hammerblade::updateBucketWithGraphItVertexSubset(next_frontier_dev, pq); - std::fill(h_next.begin(), h_next.end(), 0); - } -} -void host_sssp_push(BucketPriorityQueue &pq, std::vector &dist, int iter) { - host_sssp_pull(pq, dist, iter); -} - -int launch(int argc, char * argv[]){ - InputParser input(argc, argv); - if(!input.cmdOptionExists("-g")) { - - std::cerr << "no input args\n"; - for(auto i = 0; i < 
argc; i++) { - std::cerr << argv[i] << " "; - } - std::cerr << std::endl; - return 0; - } - std::string ucode_path = input.getRISCVFile(); - - int iter = 0; - //std::string iterstrbase = "iteration-"; - //auto pos = ucode_path.find(iterstrbase); - //auto iterstr = ucode_path.substr(pos + iterstrbase.size(), std::string::npos); - //std::stringstream ss(iterstr); - //ss >> iter; - std::cerr << "iteration: " << iter << std::endl; - - int version = 0; //pull-vertex - if(ucode_path.find("push-vertex") != std::string::npos) { - version = 1; - } - std::cerr << "load microcode" << std::endl; - hammerblade::builtin_loadMicroCodeFromFile(ucode_path); - std::cerr << "load graph" << std::endl; - - std::string graph_f = input.getCmdOption("-g"); - //std::string frontier_f = input.getCmdOption("-f"); - edges = hammerblade::builtin_loadWeightedEdgesFromFileToHB (graph_f.c_str()); - std::cerr << "out deg of 0: " << edges.out_degree(5) << "num edges: " << edges.num_edges() << std::endl; - - - Device::Ptr device = Device::GetInstance(); - dist_dev = GlobalScalar("dist"); - hammerblade::init_global_array(edges.num_nodes(), dist_dev); - hammerblade::assign_val_dma(0, edges.num_nodes(), (2147483647), dist_dev); - int start_vertex = 0; - //hammerblade::insert_val(start_vertex, 0, dist_dev); - - std::cerr << "init locks\n"; - GlobalScalar glbl_locks = GlobalScalar("locks"); - hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); - std::atomic tmp_array[NUM_LOCKS] = {}; - hammerblade::write_global_buffer_dma>(tmp_array, glbl_locks, NUM_LOCKS); - - std::cerr << "doing batch dma write" << std::endl; - device->freeze_cores(); - device->write_dma(); - device->unfreeze_cores(); - hammerblade::insert_val(start_vertex, 0, dist_dev); - std::cerr << "init pq" << std::endl; - BucketPriorityQueue pq = BucketPriorityQueue(edges.num_nodes(), &dist_dev, (hammerblade::BucketOrder)1, (hammerblade::PriorityOrder)0, (int) 128, (int) 32); - - std::cerr << "host side compute up to current iter: \n"; - 
std::vector h_dist(edges.num_nodes(), 2147483647); - if(version == 0) { - host_sssp_pull(pq, h_dist, iter); - } else { - host_sssp_push(pq, h_dist, iter); - } - hammerblade::write_global_buffer_dma(h_dist.data(), dist_dev, edges.num_nodes()); - device->freeze_cores(); - device->write_dma(); - device->unfreeze_cores(); - - std::cerr << "starting while loop" << std::endl; - Vector next_frontier_dev; - switch(version){ - case 0: { // do dense pull bfs - //device->enqueueJob("init_kernel", hb_mc_dimension(X,Y), {edges.num_nodes()}); - //device->runJobs(); - for(int i = 0; i < 1; i++) //just doing one large iteration - { - - std::cerr << "doing SSSP Delta Stepping kernel" << std::endl; - //Vector frontier = hammerblade::getBucketWithGraphItVertexSubset(pq); - Vector frontier = pq.popDenseReadyVertexSet(); - std::cerr << "got frontier from pq\n"; - next_frontier_dev = Vector(edges.num_nodes()); - //next_frontier_dev.assign(0, edges.num_nodes(), 0); - //device->freeze_cores(); - //device->write_dma(); - //device->unfreeze_cores(); - printf("0x%08x\n", frontier.getAddr()); - printf("next: 0x%08x\n", next_frontier_dev.getAddr()); - std::cerr << "initialized next front\n"; - device->enqueueJob("edgeset_apply_pull_parallel_weighted_deduplicated_from_vertexset_with_frontier_call", - hb_mc_dimension(X,Y), - {edges.getInIndicesAddr(), - edges.getInNeighborsAddr(), - frontier.getAddr(), - next_frontier_dev.getAddr(), - edges.num_nodes(), - edges.num_edges(), - edges.num_nodes()}); - device->runJobs(); - std::cerr << "updating buckets:\n"; - hammerblade::updateBucketWithGraphItVertexSubset(next_frontier_dev, pq); - hammerblade::deleteObject(frontier); - } - break; - } - case 1: { //do sparse push blocked bfs - break; - } - } - - std::cerr << "finished while loop" << std::endl; - - if(VERIFY) { - int * host_next = new int[edges.num_nodes()]; - next_frontier_dev.copyToHost(host_next, edges.num_nodes()); - - device->freeze_cores(); - device->read_dma(); - device->unfreeze_cores(); - 
- ofstream file("./frontier_verify.txt"); - if(!file.is_open()) std::cerr <<"couldn't open file\n"; - for(int i = 0; i < edges.num_nodes(); i++) { - if(host_next[i] == 1 && i % 50 == 0) std::cerr << i << std::endl; - file << host_next[i] << std::endl; - } - file.close(); - } - device->finish(); - return 0; -} -#ifdef VCS -int vcs_main(int argc, char ** argv) { - bsg_pr_test_info("Unified Main Regression Test (COSIMULATION)\n"); - int rc = launch(argc,argv); - bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); - return rc; -} -#else -int main(int argc, char ** argv) { - bsg_pr_test_info("Unified Main CUDA Regression Test (F1)\n"); - int rc = launch(argc,argv); - bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); - return rc; -} -#endif diff --git a/examples/graphit/test_sssp_delta/sssp.hpp b/examples/graphit/test_sssp_delta/sssp.hpp deleted file mode 100644 index 2dfcd3d5c..000000000 --- a/examples/graphit/test_sssp_delta/sssp.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __SSSP_BENCHMARK_HPP -#define __SSSP_BENCHMARK_HPP - -#pragma once -#include "hb_intrinsics.h" -#include "infra_hb/host/arg_parser.hpp" -#include "infra_hb/host/priority_queue.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using hammerblade::Device; -using hammerblade::Vector; -using hammerblade::GraphHB; -using hammerblade::WGraphHB; -using hammerblade::GlobalScalar; -using hammerblade::BucketPriorityQueue; -using hammerblade::Bucket; -#endif diff --git a/examples/graphit/test_vec_add_parallel/Makefile b/examples/graphit/test_vec_add_parallel/Makefile deleted file mode 100644 index 4291f23a1..000000000 --- a/examples/graphit/test_vec_add_parallel/Makefile +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2021, University of Washington All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, this list -# of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, this -# list of conditions and the following disclaimer in the documentation and/or -# other materials provided with the distribution. -# -# Neither the name of the copyright holder nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# This Makefile compiles, links, and executes examples Run `make help` -# to see the available targets for the selected platform. 
- -################################################################################ -# environment.mk verifies the build environment and sets the following -# makefile variables: -# -# LIBRAIRES_PATH: The path to the libraries directory -# HARDWARE_PATH: The path to the hardware directory -# EXAMPLES_PATH: The path to the examples directory -# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL -# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore -############################################################################### - -REPLICANT_PATH:=$(shell git rev-parse --show-toplevel) - -include $(REPLICANT_PATH)/environment.mk -SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd -CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime - -# TEST_NAME is the basename of the executable -TEST_NAME = main -# KERNEL_NAME is the name of the CUDA-Lite Kernel -KERNEL_NAME = vec_add_parallel - -############################################################################### -# Host code compilation flags and flow -############################################################################### - -# TEST_SOURCES is a list of source files that need to be compiled -TEST_SOURCES = main.c - -DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -CDEFINES += -CXXDEFINES += - -FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable -CFLAGS += -std=c99 $(FLAGS) -CXXFLAGS += -std=c++11 $(FLAGS) - -# compilation.mk defines rules for compilation of C/C++ -include $(EXAMPLES_PATH)/compilation.mk - -# Specify any header file dependencies -main.o: INCLUDES += -I$(EXAMPLES_PATH) -main.o: $(EXAMPLES_PATH)/cl_manycore_regression.h - -############################################################################### -# Host code link flags and flow -############################################################################### - -LDFLAGS += - -# link.mk defines rules for linking of the final execution binary. 
-include $(EXAMPLES_PATH)/link.mk - -############################################################################### -# Device code compilation flow -############################################################################### - -# BSG_MANYCORE_KERNELS is a list of manycore executables that should -# be built before executing. -BSG_MANYCORE_KERNELS = kernel.riscv - -kernel.rvo: RISCV_CXX = $(RISCV_CLANGXX) -kernel.riscv: kernel.rvo - -# Tile Group Dimensions -TILE_GROUP_DIM_X = 2 -TILE_GROUP_DIM_Y = 2 -RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) -RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) - -include $(EXAMPLES_PATH)/graphit/riscv.mk - -############################################################################### -# Execution flow -# -# C_ARGS: Use this to pass arguments that you want to appear in argv -# For SPMD tests C arguments are: -# -# SIM_ARGS: Use this to pass arguments to the simulator -############################################################################### -C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME) - -SIM_ARGS ?= - -# Include platform-specific execution rules -include $(EXAMPLES_PATH)/execution.mk - -############################################################################### -# Regression Flow -############################################################################### - -regression: main.exec.log - @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null - -############################################################################### -# Default rules, help, and clean -############################################################################### -.DEFAULT_GOAL := help -help: - @echo "Usage:" - @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" - @echo " $(TEST_NAME).profile: Build executable with profilers enabled" - @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" - @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" - @echo " clean: Remove all 
subdirectory-specific outputs" - - -.PHONY: clean - -clean: - - diff --git a/examples/graphit/test_vec_add_parallel/kernel.cpp b/examples/graphit/test_vec_add_parallel/kernel.cpp deleted file mode 100644 index b2ea1ae88..000000000 --- a/examples/graphit/test_vec_add_parallel/kernel.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//This kernel adds 2 vectors - -#include -#include -#include - -bsg_barrier barrier; - -extern "C" __attribute__ ((noinline)) -int kernel_vec_add_parallel(int *A, int *B, int *C, int N, int block_size_x) { - - int start_x = block_size_x * (__bsg_tile_group_id_y * __bsg_grid_dim_x + __bsg_tile_group_id_x); - for (int iter_x = __bsg_id; iter_x < block_size_x; iter_x += bsg_tiles_X * bsg_tiles_Y) { - C[start_x + iter_x] = A[start_x + iter_x] + B[start_x + iter_x]; - } - - barrier.sync(); - - return 0; -} diff --git a/examples/graphit/test_vec_add_parallel/main.c b/examples/graphit/test_vec_add_parallel/main.c deleted file mode 100644 index 07c9bd209..000000000 --- a/examples/graphit/test_vec_add_parallel/main.c +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright (c) 2019, University of Washington All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, this list -// of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// -// Neither the name of the copyright holder nor the names of its contributors may -// be used to endorse or promote products derived from this software without -// specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define ALLOC_NAME "default_allocator" - -/*! - * Runs the vector addition a grid of 2x2 tile groups. A[N] + B[N] --> C[N] - * Grid dimensions are determines by how much of a load we want for each tile group (block_size_x) - * This tests uses the software/spmd/bsg_cuda_lite_runtime/vec_add_parallel/ Manycore binary in the BSG Manycore bitbucket repository. 
-*/ - - -void host_vec_add (int *A, int *B, int *C, int N) { - for (int i = 0; i < N; i ++) { - C[i] = A[i] + B[i]; - } - return; -} - - -int kernel_vec_add_parallel (int argc, char **argv) { - int rc; - char *bin_path, *test_name; - struct arguments_path args = {NULL, NULL}; - - argp_parse (&argp_path, argc, argv, 0, 0, &args); - bin_path = args.path; - test_name = args.name; - - bsg_pr_test_info("Running the CUDA Vector Addition Kernel on a grid of 2x2 tile groups.\n\n"); - - srand(time); - - /*********************/ - /* Initialize device */ - /*********************/ - hb_mc_device_t device; - BSG_CUDA_CALL(hb_mc_device_init(&device, test_name, 0)); - - hb_mc_pod_id_t pod; - hb_mc_device_foreach_pod_id(&device, pod) - { - /**********************************************************************/ - /* Define path to binary. */ - /* Initialize device, load binary and unfreeze tiles. */ - /**********************************************************************/ - bsg_pr_test_info("Loading program for %s onto pod %d\n", - test_name, pod); - - BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod)); - BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0)); - - /***************************************************************************************************************** - * Allocate memory on the device for A, B and C. 
- ******************************************************************************************************************/ - uint32_t N = 1024; - - eva_t A_device, B_device, C_device; - BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &A_device)); /* allocate A[N] on the device */ - BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &B_device)); /* allocate B[N] on the device */ - BSG_CUDA_CALL(hb_mc_device_malloc(&device, N * sizeof(uint32_t), &C_device)); /* allocate C[N] on the device */ - - /***************************************************************************************************************** - * Allocate memory on the host for A & B and initialize with random values. - ******************************************************************************************************************/ - uint32_t A_host[N]; /* allocate A[N] on the host */ - uint32_t B_host[N]; /* allocate B[N] on the host */ - for (int i = 0; i < N; i++) { /* fill A with arbitrary data */ - A_host[i] = rand() & 0xFFFF; - B_host[i] = rand() & 0xFFFF; - } - - /***************************************************************************************************************** - * Copy A & B from host onto device DRAM. 
- ******************************************************************************************************************/ - void *dst = (void *) ((intptr_t) A_device); - void *src = (void *) &A_host[0]; - BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy A to the device */ - - dst = (void *) ((intptr_t) B_device); - src = (void *) &B_host[0]; - BSG_CUDA_CALL(hb_mc_device_memcpy (&device, dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_DEVICE)); /* Copy B to the device */ - - /***************************************************************************************************************** - * Define block_size_x/y: amount of work for each tile group - * Define tg_dim_x/y: number of tiles in each tile group - * Calculate grid_dim_x/y: number of tile groups needed based on block_size_x/y - ******************************************************************************************************************/ - uint32_t block_size_x = 64; - hb_mc_dimension_t tg_dim = { .x = 2, .y = 2 }; - hb_mc_dimension_t grid_dim = { .x = N / block_size_x, .y = 1 }; - - /***************************************************************************************************************** - * Prepare list of input arguments for kernel. 
- ******************************************************************************************************************/ - int cuda_argv[5] = {A_device, B_device, C_device, N, block_size_x}; - - /***************************************************************************************************************** - * Enquque grid of tile groups, pass in grid and tile group dimensions, kernel name, number and list of input arguments - ******************************************************************************************************************/ - BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_vec_add_parallel", 5, cuda_argv)); - - /***************************************************************************************************************** - * Launch and execute all tile groups on device and wait for all to finish. - ******************************************************************************************************************/ - BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device)); - - /***************************************************************************************************************** - * Copy result matrix back from device DRAM into host memory. - ******************************************************************************************************************/ - uint32_t C_host[N]; - src = (void *) ((intptr_t) C_device); - dst = (void *) &C_host[0]; - BSG_CUDA_CALL(hb_mc_device_memcpy (&device, (void *) dst, src, N * sizeof(uint32_t), HB_MC_MEMCPY_TO_HOST)); /* copy C to the host */ - - /***************************************************************************************************************** - * Freeze the tiles and memory manager cleanup. 
- ******************************************************************************************************************/ - BSG_CUDA_CALL(hb_mc_device_program_finish(&device)); - - /***************************************************************************************************************** - * Calculate the expected result using host code and compare the results. - ******************************************************************************************************************/ - uint32_t C_expected[N]; - host_vec_add (A_host, B_host, C_expected, N); - - - int mismatch = 0; - for (int i = 0; i < N; i++) { - if (A_host[i] + B_host[i] != C_host[i]) { - bsg_pr_err(BSG_RED("Mismatch: ") "C[%d]: 0x%08" PRIx32 " + 0x%08" PRIx32 " = 0x%08" PRIx32 "\t Expected: 0x%08" PRIx32 "\n", - i , A_host[i], B_host[i], C_host[i], C_expected[i]); - mismatch = 1; - } - } - - if (mismatch) { - return HB_MC_FAIL; - } - } - - BSG_CUDA_CALL(hb_mc_device_finish(&device)); - - return HB_MC_SUCCESS; -} - -#ifdef VCS -int vcs_main(int argc, char ** argv) -#else -int main(int argc, char ** argv) -#endif -{ - bsg_pr_test_info("test_vec_add_parallel Regression Test \n"); - int rc = kernel_vec_add_parallel(argc, argv); - bsg_pr_test_pass_fail(rc == HB_MC_SUCCESS); - return rc; -} - - From b77d6353a39c4b69630bb59aab19aaf025f90c81 Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Wed, 5 May 2021 17:12:49 -0700 Subject: [PATCH 19/22] [pr-nibble] change regression/clean rules, update kernel and host code with prefetching kernel --- examples/graphit/test_pr_nibble/Makefile | 6 +- examples/graphit/test_pr_nibble/kernel.cpp | 230 ------------------ .../test_pr_nibble/kernel/hybrid/kernel.cpp | 101 +++----- examples/graphit/test_pr_nibble/main.cpp | 28 +-- 4 files changed, 44 insertions(+), 321 deletions(-) delete mode 100644 examples/graphit/test_pr_nibble/kernel.cpp diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/test_pr_nibble/Makefile index bdd77f513..720bbdc24 100644 --- 
a/examples/graphit/test_pr_nibble/Makefile +++ b/examples/graphit/test_pr_nibble/Makefile @@ -52,7 +52,7 @@ GRAPH_PATH := $(GRAPHIT_PATH)/test/graphs/darpa-eval/jhu.mtx TEST_NAME = main # KERNEL_NAME is the name of the CUDA-Lite Kernel KERNEL_NAME = pr_nibble -HOST_TARGET := $(TEST_NAME).profile +HOST_TARGET := $(TEST_NAME).exec BASE_VERSIONS += hybrid @@ -179,7 +179,7 @@ all-versions: $(foreach v,$(VERSIONS),kernel/$v/$(HOST_TARGET).log) # Regression Flow ############################################################################### -regression: main.exec.log +regression: versions all-versions @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null ############################################################################### @@ -201,6 +201,6 @@ version.clean: .PHONY: clean -clean: version.clean +clean: bleach-versions version.clean diff --git a/examples/graphit/test_pr_nibble/kernel.cpp b/examples/graphit/test_pr_nibble/kernel.cpp deleted file mode 100644 index 79186ab8a..000000000 --- a/examples/graphit/test_pr_nibble/kernel.cpp +++ /dev/null @@ -1,230 +0,0 @@ -//#define DEBUG -#include - -#ifdef DEBUG -#define BSG_TILE_GROUP_X_DIM 1 -#define BSG_TILE_GROUP_Y_DIM 1 -#define bsg_tiles_X BSG_TILE_GROUP_X_DIM -#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM -#else -#include -// #define BSG_TILE_GROUP_X_DIM 16 -// #define BSG_TILE_GROUP_Y_DIM 8 -#endif - -#include -bsg_barrier barrier; - -#include -#include - -#ifdef DEBUG -#define pr_dbg(fmt, ...) \ - bsg_printf(fmt, ##__VA_ARGS__) -#else -#define pr_dbg(fmt, ...) 
-#endif - -__attribute__((section(".dram"))) float * __restrict p; -__attribute__((section(".dram"))) float * __restrict old_rank; -__attribute__((section(".dram"))) float * __restrict new_rank; -__attribute__((section(".dram"))) int * __restrict out_degree; -__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; -//__attribute__((section(".dram"))) double alpha = 0.15; -//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; - -template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) -{ - //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); - //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); - int start, end; - local_range(V, &start, &end); - for ( int d = start; d < end; d++) { - int degree = in_indices[d + 1] - in_indices[d]; - int * neighbors = &in_neighbors[in_indices[d]]; - for(int s = 0; s < degree; s++) { - if(from_vertexset[neighbors[s]]) { - //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); - apply_func (neighbors[s] , d); - } - } //end of loop on in neighbors - } //end of outer for loop - return 0; -} //end of edgeset apply function - -template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) -{ - //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); - //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); - int start, end; - local_range(V, &start, &end); - for ( int s = start; s < end; s++) { - if(from_vertexset[s]) { - int degree = out_indices[s + 1] - out_indices[s]; - int * neighbors = &out_neighbors[out_indices[s]]; - for(int d = 0; d < degree; d++) { - apply_func (s, neighbors[d]); - //if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); } - - } - } //end of loop on in neighbors - } //end of outer for loop 
- //barrier.sync(); - return 0; -} //end of edgeset apply function - - -struct generated_vector_op_apply_func_4 -{ - void operator() (int v) - { - out_degree[v] = generated_tmp_vector_3[v]; - }; -}; -struct new_rank_generated_vector_op_apply_func_2 -{ - void operator() (int v) - { - new_rank[v] = ((float) 0) ; - }; -}; -struct old_rank_generated_vector_op_apply_func_1 -{ - void operator() (int v) - { - old_rank[v] = ((float) 0) ; - }; -}; -struct p_generated_vector_op_apply_func_0 -{ - void operator() (int v) - { - p[v] = ((float) 0) ; - }; -}; -struct updateEdge -{ - void operator() (int src, int dst) - { - float alpha = 0.15; - new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); - }; -}; -struct updateSelf -{ - void operator() (int v) - { - float alpha = 0.15; - p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); - new_rank[v] = (0) ; - }; -}; -struct filter_frontier -{ - bool operator() (int v) - { - float epsilon = (float) 1e-6; - bool output ; - //if(old_rank[v] == 0) return 0; - if(new_rank[v] == 0) return 0; - //output = (old_rank[v]) > ((out_degree[v] * epsilon)); - output = (new_rank[v]) > ((out_degree[v] * epsilon)); - return output; - }; -}; - -extern "C" int __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - p_generated_vector_op_apply_func_0()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - old_rank_generated_vector_op_apply_func_1()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; 
iter_x < end; iter_x++) { - new_rank_generated_vector_op_apply_func_2()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - generated_vector_op_apply_func_4()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int V, int tag_c) { - //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - barrier.sync(); - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - updateSelf()(iter_x); - } - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { - barrier.sync(); - //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - bsg_saif_start(); - edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); - bsg_saif_end(); - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - - extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { - barrier.sync(); - bsg_cuda_print_stat_start(tag_c); - bsg_saif_start(); - edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); - bsg_saif_end(); - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - -extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { - //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c); - //pr_dbg("%i: on frontier filter 
%i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - barrier.sync(); - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - if (iter_x < V) { - next5[iter_x] = 0; - if ( filter_frontier()( iter_x ) ) { - next5[iter_x] = 1; - //pr_dbg("added vertex %i to frontier\n", iter_x); - } - } - else { break; } - } //end of loop - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - - diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp index 16e66425c..14449a85d 100644 --- a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp +++ b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp @@ -29,10 +29,11 @@ __attribute__((section(".dram"))) float * __restrict p; __attribute__((section(".dram"))) float * __restrict old_rank; __attribute__((section(".dram"))) float * __restrict new_rank; __attribute__((section(".dram"))) int * __restrict out_degree; -__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; +//__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; //__attribute__((section(".dram"))) double alpha = 0.15; //__attribute__((section(".dram"))) double epsilon = (double) 1e-6; + template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) { //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); @@ -73,35 +74,6 @@ template int edgeset_apply_push_parallel_from_vertexset(i return 0; } //end of edgeset apply function - -struct generated_vector_op_apply_func_4 -{ - void operator() (int v) - { - out_degree[v] = generated_tmp_vector_3[v]; - }; -}; -struct new_rank_generated_vector_op_apply_func_2 -{ - void operator() (int v) - { - new_rank[v] = ((float) 0) ; - }; -}; -struct old_rank_generated_vector_op_apply_func_1 -{ - void operator() (int v) - { - old_rank[v] = 
((float) 0) ; - }; -}; -struct p_generated_vector_op_apply_func_0 -{ - void operator() (int v) - { - p[v] = ((float) 0) ; - }; -}; struct updateEdge { void operator() (int src, int dst) @@ -133,49 +105,14 @@ struct filter_frontier }; }; -extern "C" int __attribute__ ((noinline)) p_generated_vector_op_apply_func_0_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - p_generated_vector_op_apply_func_0()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) old_rank_generated_vector_op_apply_func_1_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - old_rank_generated_vector_op_apply_func_1()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) new_rank_generated_vector_op_apply_func_2_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - new_rank_generated_vector_op_apply_func_2()(iter_x); - } - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) generated_vector_op_apply_func_4_kernel(int V) { - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - generated_vector_op_apply_func_4()(iter_x); - } - barrier.sync(); - return 0; -} extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { + //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c); bsg_cuda_print_stat_start(tag_c); barrier.sync(); int start, end; local_range(V, &start, &end); for (int iter_x = start; iter_x < end; iter_x++) { - if(frontier[iter_x]) { updateSelf()(iter_x); } + if(frontier[iter_x]) {updateSelf()(iter_x);} } bsg_cuda_print_stat_end(tag_c); barrier.sync(); @@ -226,4 +163,34 @@ extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5 return 0; } +extern "C" void prefetch(int * in_indices, int 
* in_neighbors, int * from_vertexset, int V, int E) { + int id = __bsg_id; + int threads = bsg_tiles_X * bsg_tiles_Y; + // prefetch all data; + for (int i = 32 * id; i < E; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (p[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i])); + } + barrier.sync(); + return ; + +} + diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp index f0bee8b64..56dd9aaf6 100644 --- a/examples/graphit/test_pr_nibble/main.cpp +++ b/examples/graphit/test_pr_nibble/main.cpp @@ -45,14 +45,12 @@ int test_pr_nibble(int argc, char ** argv){ if(ucode_path.find("push") != std::string::npos) { version = 1; } - else if(ucode_path.find("block") != std::string::npos) { - version = 2; - } int hybrid = 0; //default to vertex pull if(ucode_path.find("hybrid") != std::string::npos) { hybrid = 1; } std::cerr << "version: " << version << std::endl; + std::cerr << "hybrid: " << hybrid << std::endl; std::cerr << "load microcode" << std::endl; hammerblade::builtin_loadMicroCodeFromFile(ucode_path); @@ -129,6 +127,9 @@ int test_pr_nibble(int argc, char ** argv){ switch(version) { case 0: //vertex pull std::cerr << "pull kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , 
edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); std::cerr << "run update self vertex kernel\n"; device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); @@ -147,6 +148,9 @@ int test_pr_nibble(int argc, char ** argv){ break; case 1: //vertex push std::cerr << "push kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); std::cerr << "run update self vertex kernel\n"; device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); device->runJobs(); @@ -163,24 +167,6 @@ int test_pr_nibble(int argc, char ** argv){ f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; - case 2: //blocked pull - std::cerr << "blocked pull kernel\n"; - std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "run update edges kernel on iter : " << iter << "\n"; - device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInVertexlistAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "create next frontier\n"; - device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); - device->runJobs(); - std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); - f_sz = builtin_getVertexSetSizeHB(frontier, 
edges.num_nodes()); - std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; - break; } tag_c++; From 7d47bbf7cf9ad696ec5c0aa4f8d64671de7cf8fb Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Wed, 5 May 2021 17:16:08 -0700 Subject: [PATCH 20/22] clone graphit submodule with ssh not https --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 18ee1bd0f..f352d419a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,4 +9,4 @@ url = git@github.com:bespoke-silicon-group/hammerblade-helpers [submodule "examples/graphit/graphit-src"] path = examples/graphit/graphit-src - url = https://github.com/bespoke-silicon-group/graphit.git + url = git@github.com:bespoke-silicon-group/graphit.git From 741062c4b4ce71680b864edbaa4d9b0b00b77a2b Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 10 May 2021 14:02:33 -0700 Subject: [PATCH 21/22] rename test, remove tabs, switch to 4 spaces for tabs, some cleaning up of code --- .../{test_pr_nibble => pr_nibble}/Makefile | 0 .../pr_nibble/kernel/hybrid/kernel.cpp | 175 ++++++++++++++++ .../kernel/include/pr_nibble.hpp | 0 examples/graphit/pr_nibble/main.cpp | 177 ++++++++++++++++ .../{test_pr_nibble => pr_nibble}/pr.hpp | 2 +- examples/graphit/pr_nibble/pr_host.hpp | 45 ++++ .../test_pr_nibble/kernel/hybrid/kernel.cpp | 196 ------------------ examples/graphit/test_pr_nibble/main.cpp | 191 ----------------- examples/graphit/test_pr_nibble/pr_host.hpp | 50 ----- 9 files changed, 398 insertions(+), 438 deletions(-) rename examples/graphit/{test_pr_nibble => pr_nibble}/Makefile (100%) create mode 100644 examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp rename examples/graphit/{test_pr_nibble => pr_nibble}/kernel/include/pr_nibble.hpp (100%) create mode 100644 examples/graphit/pr_nibble/main.cpp rename examples/graphit/{test_pr_nibble => pr_nibble}/pr.hpp (90%) create mode 100644 examples/graphit/pr_nibble/pr_host.hpp delete mode 100644 
examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp delete mode 100644 examples/graphit/test_pr_nibble/main.cpp delete mode 100644 examples/graphit/test_pr_nibble/pr_host.hpp diff --git a/examples/graphit/test_pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile similarity index 100% rename from examples/graphit/test_pr_nibble/Makefile rename to examples/graphit/pr_nibble/Makefile diff --git a/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp new file mode 100644 index 000000000..294d564a6 --- /dev/null +++ b/examples/graphit/pr_nibble/kernel/hybrid/kernel.cpp @@ -0,0 +1,175 @@ +//#define DEBUG +#include + +#ifdef DEBUG +#define BSG_TILE_GROUP_X_DIM 1 +#define BSG_TILE_GROUP_Y_DIM 1 +#define bsg_tiles_X BSG_TILE_GROUP_X_DIM +#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM +#else +#include +#endif + +#include +bsg_barrier barrier; + +#include +#include + +#ifdef DEBUG +#define pr_dbg(fmt, ...) \ + bsg_printf(fmt, ##__VA_ARGS__) +#else +#define pr_dbg(fmt, ...) 
+#endif + +__attribute__((section(".dram"))) float * __restrict p; +__attribute__((section(".dram"))) float * __restrict old_rank; +__attribute__((section(".dram"))) float * __restrict new_rank; +__attribute__((section(".dram"))) int * __restrict out_degree; + + +template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + int start, end; + local_range(V, &start, &end); + for ( int d = start; d < end; d++) { + int degree = in_indices[d + 1] - in_indices[d]; + int * neighbors = &in_neighbors[in_indices[d]]; + for(int s = 0; s < degree; s++) { + if(from_vertexset[neighbors[s]]) { + apply_func (neighbors[s] , d); + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) +{ + int start, end; + local_range(V, &start, &end); + for ( int s = start; s < end; s++) { + if(from_vertexset[s]) { + int degree = out_indices[s + 1] - out_indices[s]; + int * neighbors = &out_neighbors[out_indices[s]]; + for(int d = 0; d < degree; d++) { + apply_func (s, neighbors[d]); + + } + } //end of loop on in neighbors + } //end of outer for loop + return 0; +} //end of edgeset apply function + +struct updateEdge +{ + void operator() (int src, int dst) + { + float alpha = 0.15; + new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); + }; +}; +struct updateSelf +{ + void operator() (int v) + { + float alpha = 0.15; + p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); + new_rank[v] = (0) ; + }; +}; +struct filter_frontier +{ + bool operator() (int v) + { + float epsilon = (float) 1e-6; + bool output ; + if(new_rank[v] == 0) return 0; + output = (new_rank[v]) > ((out_degree[v] * epsilon)); + 
return output; + }; +}; + +extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if(frontier[iter_x]) {updateSelf()(iter_x);} + } + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} +extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + + extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { + barrier.sync(); + bsg_cuda_print_stat_start(tag_c); + bsg_saif_start(); + edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); + bsg_saif_end(); + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { + bsg_cuda_print_stat_start(tag_c); + barrier.sync(); + int start, end; + local_range(V, &start, &end); + for (int iter_x = start; iter_x < end; iter_x++) { + if (iter_x < V) { + next5[iter_x] = 0; + if ( filter_frontier()( iter_x ) ) { + next5[iter_x] = 1; + } + } + else { break; } + } //end of loop + bsg_cuda_print_stat_end(tag_c); + barrier.sync(); + return 0; +} + +extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) { + int id = __bsg_id; + int threads = bsg_tiles_X * bsg_tiles_Y; + // 
prefetch all data; + for (int i = 32 * id; i < E; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (p[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i])); + } + for (int i = 32 * id; i < V; i += 32 * threads) { + asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i])); + } + barrier.sync(); + return ; + +} diff --git a/examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp b/examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp similarity index 100% rename from examples/graphit/test_pr_nibble/kernel/include/pr_nibble.hpp rename to examples/graphit/pr_nibble/kernel/include/pr_nibble.hpp diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp new file mode 100644 index 000000000..55a05fd8e --- /dev/null +++ b/examples/graphit/pr_nibble/main.cpp @@ -0,0 +1,177 @@ +#include "pr.hpp" + +//#define DEBUG + +#define VERIFY 0 + +#ifdef DEBUG +#define X 1 +#define Y 1 +#else +#define X 16 //tile group dim X +#define Y 8 // tile group dim Y +#endif + +#define ROOT 6 +#define NUM_LOCKS 1024 //width of manycore * 64 + +GraphHB edges; +GlobalScalar p_dev; +GlobalScalar old_rank_dev; +GlobalScalar new_rank_dev; +GlobalScalar out_degree_dev; + +#include "pr_host.hpp" + +int test_pr_nibble(int argc, char ** argv){ + InputParser input(argc, argv); + if(!input.cmdOptionExists("-g")){ + std::cerr << "no input args\n"; + return 0; + } + std::string ucode_path = input.getRISCVFile(); + + int iter = 0; + std::string 
iterstrbase = "iteration-"; + auto pos = ucode_path.find(iterstrbase); + auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); + std::stringstream ss(iterstr); + ss >> iter; + std::cerr << "iteration: " << iter << std::endl; + + int version = 0; //default to vertex pull + if(ucode_path.find("push") != std::string::npos) { + version = 1; + } + int hybrid = 0; //default to vertex pull + if(ucode_path.find("hybrid") != std::string::npos) { + hybrid = 1; + } + std::cerr << "version: " << version << std::endl; + std::cerr << "hybrid: " << hybrid << std::endl; + std::cerr << "load microcode" << std::endl; + hammerblade::builtin_loadMicroCodeFromFile(ucode_path); + + std::cerr << "load graph" << std::endl; + std::string graph_f = input.getCmdOption("-g"); + edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); + + std::cerr << "size of graph: " << std::endl; + std::cerr << edges.num_nodes() << std::endl; + std::cerr << edges.num_edges() << std::endl; + + std::cerr << "init global scalars" << std::endl; + p_dev = GlobalScalar("p"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); + old_rank_dev = GlobalScalar("old_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); + new_rank_dev = GlobalScalar("new_rank"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); + out_degree_dev = GlobalScalar("out_degree"); + hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); + + std::cerr << "init locks" << std::endl; + GlobalScalar glbl_locks = GlobalScalar("locks"); + hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); + std::atomic tmp_a[NUM_LOCKS] = {}; + Device::Ptr device = Device::GetInstance(); + int start_vertex = ROOT; + Vector frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); + + std::vector hfrontier(edges.num_nodes(), 0); + std::vector p(edges.num_nodes(), (float) 
0.0); + std::vector new_rank(edges.num_nodes(), (float) 0.0); + std::vector old_rank(edges.num_nodes(), (float) 0.0); + std::vector out_degs = edges.get_out_degrees(); + + //compute up to current iter on host + hfrontier[start_vertex] = 1; + new_rank[start_vertex] = (float) 1.0; + old_rank[start_vertex] = (float) 1.0; + host_pr_calc(p, old_rank, new_rank, hfrontier, iter); + + //copy all variables at their current state to device + frontier.copyToDevice(hfrontier.data(), hfrontier.size()); + hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); + hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); + hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); + hammerblade::write_global_buffer_dma(out_degs.data(), out_degree_dev, out_degs.size()); + //initialize locks for atomics on device + hammerblade::write_global_buffer_dma>(tmp_a, glbl_locks, NUM_LOCKS); + + device->freeze_cores(); + device->write_dma(); + device->unfreeze_cores(); + //determine push or pull traversal for this iteration + if(hybrid) { + int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1); + int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges()); + if(dir){ + version = 0; //pull + } else { + version = 1; //push + } + } + + std::cerr << "start of while loop\n"; + int tag_c = 0; + int f_sz = 0; + switch(version) { //0 = pull traversal, 1 = push traversal + case 0: //vertex pull + std::cerr << "pull kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; +
device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; + break; + case 1: //vertex push + std::cerr << "push kernel\n"; + std::cerr << "preloading the cache\n"; + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); + std::cerr << "run update self vertex kernel\n"; + device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "run update edges kernel on iter : " << iter << "\n"; + device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); + tag_c++; + std::cerr << "create next frontier\n"; + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); + std::cerr << "swap arrays\n"; + hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); + std::cerr << "size of frontier after iteration " << iter << " : " << f_sz <<
std::endl; + break; + } + if(VERIFY) { //when VERIFY is non-zero, dump the device's old_rank values to ./rank.txt + ofstream ver_file; + ver_file.open("./rank.txt"); + float host_rank[edges.num_nodes()]; + hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); + for(int i = 0; i < edges.num_nodes(); i++) { + ver_file << host_rank[i] << std::endl; + } + ver_file.close(); + } + device->finish(); + return 0; +} + +declare_program_main("test_pr_nibble", test_pr_nibble); diff --git a/examples/graphit/test_pr_nibble/pr.hpp b/examples/graphit/pr_nibble/pr.hpp similarity index 90% rename from examples/graphit/test_pr_nibble/pr.hpp rename to examples/graphit/pr_nibble/pr.hpp index 5cce0e30a..ae01c8cc2 100644 --- a/examples/graphit/test_pr_nibble/pr.hpp +++ b/examples/graphit/pr_nibble/pr.hpp @@ -2,7 +2,7 @@ #ifndef __PR_PULL_BENCHMARK_HPP #define __PR_PULL_BENCHMARK_HPP -#include "hb_intrinsics.h" +#include "hb_intrinsics.h" //graphit host runtime libs #include "infra_hb/host/arg_parser.hpp" #include #include diff --git a/examples/graphit/pr_nibble/pr_host.hpp b/examples/graphit/pr_nibble/pr_host.hpp new file mode 100644 index 000000000..fcbb811e0 --- /dev/null +++ b/examples/graphit/pr_nibble/pr_host.hpp @@ -0,0 +1,45 @@ +//function to compute pr-nibble on host up to current iter +#pragma once +#include +#include + +inline void host_pr_calc(std::vector & p, std::vector & old_rank, std::vector & new_rank, std::vector & frontier, int iter) { + float alpha = (float) 0.15; + float epsilon = (float) 1e-06; + auto g = edges.getHostGraph(); + int * in_neigh = g.in_neighbors_shared_.get(); + int ** in_index = g.in_index_shared_.get(); + for(int i = 0; i < iter; i++) { + new_rank.assign(old_rank.begin(), old_rank.end()); + //print out iteration and size: + int num_items = std::count(frontier.begin(), frontier.end(), 1); + std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; + //update_self + for(int v = 0; v < 
g.num_nodes(); v++) { + if(frontier[v]) { + p[v] += (2.0 * alpha) / (1.0 + alpha) *
old_rank[v]; + new_rank[v] = (float) 0.0 ; + } + } + //update edges + for(int d = 0; d < g.num_nodes(); d++) { + for(int s : g.in_neigh(d)) { + if(frontier[s]){ + float update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; + update = update / ((float) g.out_degree(s)); + new_rank[d] += update; + } + } + } + old_rank.assign(new_rank.begin(), new_rank.end()); + //update frontier + for(int v = 0; v < g.num_nodes(); v++) { + frontier[v] = 0; + if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) { + frontier[v] = 1; + } + } + } + int num_items = std::count(frontier.begin(), frontier.end(), 1); + std::cerr << "returning with frontier size: " << num_items << std::endl; +} diff --git a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp b/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp deleted file mode 100644 index 14449a85d..000000000 --- a/examples/graphit/test_pr_nibble/kernel/hybrid/kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//#define DEBUG -#include - -#ifdef DEBUG -#define BSG_TILE_GROUP_X_DIM 1 -#define BSG_TILE_GROUP_Y_DIM 1 -#define bsg_tiles_X BSG_TILE_GROUP_X_DIM -#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM -#else -#include -// #define BSG_TILE_GROUP_X_DIM 16 -// #define BSG_TILE_GROUP_Y_DIM 8 -#endif - -#include -bsg_barrier barrier; - -#include -#include - -#ifdef DEBUG -#define pr_dbg(fmt, ...) \ - bsg_printf(fmt, ##__VA_ARGS__) -#else -#define pr_dbg(fmt, ...) 
-#endif - -__attribute__((section(".dram"))) float * __restrict p; -__attribute__((section(".dram"))) float * __restrict old_rank; -__attribute__((section(".dram"))) float * __restrict new_rank; -__attribute__((section(".dram"))) int * __restrict out_degree; -//__attribute__((section(".dram"))) int * __restrict generated_tmp_vector_3; -//__attribute__((section(".dram"))) double alpha = 0.15; -//__attribute__((section(".dram"))) double epsilon = (double) 1e-6; - - -template int edgeset_apply_pull_parallel_from_vertexset(int *in_indices , int *in_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) -{ - //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); - //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); - int start, end; - local_range(V, &start, &end); - for ( int d = start; d < end; d++) { - int degree = in_indices[d + 1] - in_indices[d]; - int * neighbors = &in_neighbors[in_indices[d]]; - for(int s = 0; s < degree; s++) { - if(from_vertexset[neighbors[s]]) { - //pr_dbg("found a vertex to update: %i %i\n", neighbors[s], d); - apply_func (neighbors[s] , d); - } - } //end of loop on in neighbors - } //end of outer for loop - return 0; -} //end of edgeset apply function - -template int edgeset_apply_push_parallel_from_vertexset(int *out_indices , int *out_neighbors, int* from_vertexset, APPLY_FUNC apply_func, int V, int E, int block_size_x) -{ - //if(bsg_id == 0) pr_dbg("val of root front: %i\n", from_vertexset[6]); - //if(bsg_id == 0) pr_dbg("size of graph: %i\n", V); - int start, end; - local_range(V, &start, &end); - for ( int s = start; s < end; s++) { - if(from_vertexset[s]) { - int degree = out_indices[s + 1] - out_indices[s]; - int * neighbors = &out_neighbors[out_indices[s]]; - for(int d = 0; d < degree; d++) { - apply_func (s, neighbors[d]); - //if (new_rank[neighbors[d]] != 0.0){ pr_dbg("value updated in iteration: %i\n", neighbors[d]); } - - } - } //end of loop on in neighbors - } //end of outer for 
loop - //barrier.sync(); - return 0; -} //end of edgeset apply function - -struct updateEdge -{ - void operator() (int src, int dst) - { - float alpha = 0.15; - new_rank[dst] = (new_rank[dst] + (((((1) - alpha) / ((1) + alpha)) * old_rank[src]) / out_degree[src])); - }; -}; -struct updateSelf -{ - void operator() (int v) - { - float alpha = 0.15; - p[v] = (p[v] + ((((2) * alpha) / ((1) + alpha)) * old_rank[v])); - new_rank[v] = (0) ; - }; -}; -struct filter_frontier -{ - bool operator() (int v) - { - float epsilon = (float) 1e-6; - bool output ; - //if(old_rank[v] == 0) return 0; - if(new_rank[v] == 0) return 0; - //output = (old_rank[v]) > ((out_degree[v] * epsilon)); - output = (new_rank[v]) > ((out_degree[v] * epsilon)); - return output; - }; -}; - -extern "C" int __attribute__ ((noinline)) updateSelf_kernel(int * frontier, int V, int tag_c) { - //pr_dbg("%i: on update self tag: %i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - barrier.sync(); - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - if(frontier[iter_x]) {updateSelf()(iter_x);} - } - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} -extern "C" int __attribute__ ((noinline)) edgeset_apply_pull_parallel_from_vertexset_call(int *in_indices, int *in_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { - barrier.sync(); - //pr_dbg("%i: on update edges %i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - bsg_saif_start(); - edgeset_apply_pull_parallel_from_vertexset(in_indices, in_neighbors, frontier, updateEdge(), V, E, block_size_x); - bsg_saif_end(); - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - - extern "C" int __attribute__ ((noinline)) edgeset_apply_push_parallel_from_vertexset_call(int *out_indices, int *out_neighbors, int *frontier, int V, int E, int block_size_x, int tag_c) { - barrier.sync(); - bsg_cuda_print_stat_start(tag_c); - bsg_saif_start(); - 
edgeset_apply_push_parallel_from_vertexset(out_indices, out_neighbors, frontier, updateEdge(), V, E, block_size_x); - bsg_saif_end(); - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - -extern "C" int __attribute__ ((noinline)) filter_frontier_where_call(int * next5, int V, int block_size_x, int tag_c) { - //if(bsg_id == 0) pr_dbg("0x%08x next, %i tag\n", next5, tag_c); - //pr_dbg("%i: on frontier filter %i\n", bsg_id, tag_c); - bsg_cuda_print_stat_start(tag_c); - barrier.sync(); - int start, end; - local_range(V, &start, &end); - for (int iter_x = start; iter_x < end; iter_x++) { - if (iter_x < V) { - next5[iter_x] = 0; - if ( filter_frontier()( iter_x ) ) { - next5[iter_x] = 1; - //pr_dbg("added vertex %i to frontier\n", iter_x); - } - } - else { break; } - } //end of loop - bsg_cuda_print_stat_end(tag_c); - barrier.sync(); - return 0; -} - -extern "C" void prefetch(int * in_indices, int * in_neighbors, int * from_vertexset, int V, int E) { - int id = __bsg_id; - int threads = bsg_tiles_X * bsg_tiles_Y; - // prefetch all data; - for (int i = 32 * id; i < E; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (in_neighbors[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (in_indices[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (from_vertexset[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (out_degree[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (p[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (old_rank[i])); - } - for (int i = 32 * id; i < V; i += 32 * threads) { - asm volatile ("lw x0, %[p]" :: [p] "m" (new_rank[i])); - } - barrier.sync(); - return ; - -} - - diff --git a/examples/graphit/test_pr_nibble/main.cpp b/examples/graphit/test_pr_nibble/main.cpp 
deleted file mode 100644 index 56dd9aaf6..000000000 --- a/examples/graphit/test_pr_nibble/main.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "pr.hpp" - -//#define DEBUG - -#define VERIFY 0 - -#ifdef DEBUG -#define X 1 -#define Y 1 -#else -#define X 16 -#define Y 8 -#endif - -#define ROOT 6 //eventually we will need to do 50 start vertices (in parallel) -#define NUM_LOCKS 1024 //width of manycore * 64 - -GraphHB edges; -GlobalScalar p_dev; -GlobalScalar old_rank_dev; -GlobalScalar new_rank_dev; -GlobalScalar out_degree_dev; -//GlobalScalar alpha_dev; -//GlobalScalar epsilon_dev; - -#include "pr_host.hpp" - -int test_pr_nibble(int argc, char ** argv){ - InputParser input(argc, argv); - if(!input.cmdOptionExists("-g")){ - std::cerr << "no input args\n"; - return 0; - } - std::string ucode_path = input.getRISCVFile(); - - int iter = 0; - std::string iterstrbase = "iteration-"; - auto pos = ucode_path.find(iterstrbase); - auto iterstr = ucode_path.substr(pos+iterstrbase.size(), std::string::npos); - std::stringstream ss(iterstr); - ss >> iter; - std::cerr << "iteration: " << iter << std::endl; - - int version = 0; //default to vertex pull - if(ucode_path.find("push") != std::string::npos) { - version = 1; - } - int hybrid = 0; //default to vertex pull - if(ucode_path.find("hybrid") != std::string::npos) { - hybrid = 1; - } - std::cerr << "version: " << version << std::endl; - std::cerr << "hybrid: " << hybrid << std::endl; - std::cerr << "load microcode" << std::endl; - hammerblade::builtin_loadMicroCodeFromFile(ucode_path); - - std::cerr << "load graph" << std::endl; - std::string graph_f = input.getCmdOption("-g"); - edges = hammerblade::builtin_loadEdgesFromFileToHB (graph_f.c_str()); - - std::cerr << "size of graph: " << std::endl; - std::cerr << edges.num_nodes() << std::endl; - std::cerr << edges.num_edges() << std::endl; - std::cerr << "init global scalars" << std::endl; - - p_dev = GlobalScalar("p"); - 
hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), p_dev); - old_rank_dev = GlobalScalar("old_rank"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), old_rank_dev); - new_rank_dev = GlobalScalar("new_rank"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), new_rank_dev); - out_degree_dev = GlobalScalar("out_degree"); - hammerblade::init_global_array(hammerblade::builtin_getVerticesHB(edges), out_degree_dev); - //alpha_dev = GlobalScalar("alpha"); - //epsilon_dev = GlobalScalar("epsilon"); - std::cerr << "init locks" << std::endl; - GlobalScalar glbl_locks = GlobalScalar("locks"); - hammerblade::init_global_array>(NUM_LOCKS, glbl_locks); - std::atomic tmp_a[NUM_LOCKS] = {}; - Device::Ptr device = Device::GetInstance(); - float alpha = ((float) 0.15) ; - float epsilon = ((float) 1e-06) ; - int start_vertex = ROOT; - Vector frontier = Vector(hammerblade::builtin_getVerticesHB(edges)); - - std::vector hfrontier(edges.num_nodes(), 0); - std::vector p(edges.num_nodes(), (float) 0.0); - std::vector new_rank(edges.num_nodes(), (float) 0.0); - std::vector old_rank(edges.num_nodes(), (float) 0.0); - std::vector out_degs = edges.get_out_degrees(); - - //compute up to current iter on host - hfrontier[start_vertex] = 1; - new_rank[start_vertex] = (float) 1.0; - old_rank[start_vertex] = (float) 1.0; - host_pr_calc(p, old_rank, new_rank, hfrontier, iter); - - frontier.copyToDevice(hfrontier.data(), hfrontier.size()); - - //next_frontier.copyToDevice(zeros.data(), zeros.size()); - hammerblade::write_global_buffer_dma(p.data(), p_dev, p.size()); - hammerblade::write_global_buffer_dma(old_rank.data(), old_rank_dev, old_rank.size()); - hammerblade::write_global_buffer_dma(new_rank.data(), new_rank_dev, new_rank.size()); - hammerblade::write_global_buffer_dma(out_degs.data(), out_degree_dev, out_degs.size()); - hammerblade::write_global_buffer_dma>(tmp_a, glbl_locks, NUM_LOCKS); - - device->freeze_cores(); - 
device->write_dma(); - device->unfreeze_cores(); - if(hybrid || version == 2) { - int num_items = std::count(hfrontier.begin(), hfrontier.end(), 1); - int dir = calculate_direction(num_items, hfrontier, edges, edges.num_nodes(), edges.num_edges()); - if(dir){ - if(version != 2) version = 0; //pull - } else { - version = 1; //push - } - } - - std::cerr << "start of while loop\n"; - int tag_c = 0; - //while ( builtin_getVertexSetSizeHB(frontier, edges.num_nodes()) != 0) - for(int i = 0; i < 1; i++) - { - int f_sz = 0; - //new_rank = old_rank; - switch(version) { - case 0: //vertex pull - std::cerr << "pull kernel\n"; - std::cerr << "preloading the cache\n"; - device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); - device->runJobs(); - std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "run update edges kernel on iter : " << iter << "\n"; - device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "create next frontier\n"; - device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); - device->runJobs(); - std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); - f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); - std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; - break; - case 1: //vertex push - std::cerr << "push kernel\n"; - std::cerr << "preloading the cache\n"; - device->enqueueJob("prefetch", 
hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); - device->runJobs(); - std::cerr << "run update self vertex kernel\n"; - device->enqueueJob("updateSelf_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "run update edges kernel on iter : " << iter << "\n"; - device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); - device->runJobs(); - tag_c++; - std::cerr << "create next frontier\n"; - device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); - device->runJobs(); - std::cerr << "swap arrays\n"; - hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); - f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); - std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; - break; - } - tag_c++; - - iter++; - } - std::cerr << "*******end of program********\n"; - std::cerr << "took: " << iter << " iterations to complete\n"; - if(VERIFY) { - ofstream ver_file; - ver_file.open("./rank.txt"); - float host_rank[edges.num_nodes()]; - hammerblade::read_global_buffer_dma(host_rank, old_rank_dev, edges.num_nodes()); - for(int i = 0; i < edges.num_nodes(); i++) { - ver_file << host_rank[i] << std::endl; - } - ver_file.close(); - } - device->finish(); - return 0; -} - -declare_program_main("test_pr_nibble", test_pr_nibble); diff --git a/examples/graphit/test_pr_nibble/pr_host.hpp b/examples/graphit/test_pr_nibble/pr_host.hpp deleted file mode 100644 index 1923c6d6d..000000000 --- a/examples/graphit/test_pr_nibble/pr_host.hpp +++ /dev/null @@ -1,50 +0,0 @@ -//function to compute pr-nibble on host up to current iter 
-#pragma once -#include -#include - -inline void host_pr_calc(std::vector & p, std::vector & old_rank, std::vector & new_rank, std::vector & frontier, int iter) { - float alpha = (float) 0.15; - float epsilon = (float) 1e-06; - auto g = edges.getHostGraph(); - int * in_neigh = g.in_neighbors_shared_.get(); - int ** in_index = g.in_index_shared_.get(); - std::string fname = "iter-" + std::to_string(iter) + ".txt"; - ofstream ofile; - ofile.open (fname); - for(int i = 0; i < iter; i++) { - new_rank.assign(old_rank.begin(), old_rank.end()); - //print out iteration and size: - int num_items = std::count(frontier.begin(), frontier.end(), 1); - std::cerr << "on iteration: " << i << " with frontier size: " << num_items << std::endl; - //update_self - for(int v = 0; v < g.num_nodes(); v++) { - if(frontier[v]) { - p[v] += (2.0 * alpha) / (1.0 + alpha) * old_rank[v]; - new_rank[v] = (float) 0.0 ; - } - } - //update edges - for(int d = 0; d < g.num_nodes(); d++) { - for(int s : g.in_neigh(d)) { - if(frontier[s]){ - float update = ((1.0 - alpha) / (1.0 + alpha)) * old_rank[s]; - update = update / ((float) g.out_degree(s)); - new_rank[d] += update; - if(i == (iter - 1)) {ofile << s << " " << d << " " << new_rank[d] << std::endl;} - } - } - } - old_rank.assign(new_rank.begin(), new_rank.end()); - //update frontier - for(int v = 0; v < g.num_nodes(); v++) { - frontier[v] = 0; - if(g.out_degree(v) > 0 && old_rank[v] >= (((float) g.out_degree(v)) * epsilon)) { - frontier[v] = 1; - } - } - } - ofile.close(); - int num_items = std::count(frontier.begin(), frontier.end(), 1); - std::cerr << "returning with frontier size: " << num_items << std::endl; -} From 15278cf7d798dc06b65afb5131b1bd84a2ce7c5a Mon Sep 17 00:00:00 2001 From: Emily Furst Date: Mon, 10 May 2021 14:16:02 -0700 Subject: [PATCH 22/22] [pr-nibble] fixing bad makefile path, vim trampled all over my camel case --- examples/graphit/pr_nibble/Makefile | 2 +- examples/graphit/pr_nibble/main.cpp | 38 
++++++++++++++--------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/graphit/pr_nibble/Makefile b/examples/graphit/pr_nibble/Makefile index 720bbdc24..af6475765 100644 --- a/examples/graphit/pr_nibble/Makefile +++ b/examples/graphit/pr_nibble/Makefile @@ -128,7 +128,7 @@ TILE_GROUP_DIM_Y = 8 RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) -RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/graphit/test_pr_nibble/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ +RISCV_INCLUDES += -I$(CURRENT_PATH)/kernel/include -I$(GRAPHIT_PATH)/src/runtime_lib/infra_hb/device/ include $(EXAMPLES_PATH)/cuda/riscv.mk diff --git a/examples/graphit/pr_nibble/main.cpp b/examples/graphit/pr_nibble/main.cpp index 55a05fd8e..aa2d4032f 100644 --- a/examples/graphit/pr_nibble/main.cpp +++ b/examples/graphit/pr_nibble/main.cpp @@ -120,47 +120,47 @@ int test_pr_nibble(int argc, char ** argv){ case 0: //vertex pull std::cerr << "pull kernel\n"; std::cerr << "preloading the cache\n"; - device->enqueuejob("prefetch", hb_mc_dimension(x,y),{edges.getinindicesaddr() , edges.getinneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges()}); - device->runjobs(); + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); std::cerr << "run update self vertex kernel\n"; - device->enqueuejob("updateself_kernel",hb_mc_dimension(x,y), {frontier.getaddr(), edges.num_nodes(), tag_c}); - device->runjobs(); + device->enqueueJob("updateself_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); + device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; - device->enqueuejob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(x,y),{edges.getinindicesaddr() , edges.getinneighborsaddr(), 
frontier.getaddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); - device->runjobs(); + device->enqueueJob("edgeset_apply_pull_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getInIndicesAddr() , edges.getInNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); tag_c++; std::cerr << "create next frontier\n"; - device->enqueuejob("filter_frontier_where_call", hb_mc_dimension(x,y),{frontier.getaddr(), edges.num_nodes(), edges.num_edges(), tag_c}); - device->runjobs(); + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); std::cerr << "swap arrays\n"; hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); - f_sz = builtin_getvertexsetsizehb(frontier, edges.num_nodes()); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; case 1: //vertex push std::cerr << "push kernel\n"; std::cerr << "preloading the cache\n"; - device->enqueuejob("prefetch", hb_mc_dimension(x,y),{edges.getoutindicesaddr() , edges.getoutneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges()}); - device->runjobs(); + device->enqueueJob("prefetch", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges()}); + device->runJobs(); std::cerr << "run update self vertex kernel\n"; - device->enqueuejob("updateself_kernel",hb_mc_dimension(x,y), {frontier.getaddr(), edges.num_nodes(), tag_c}); - device->runjobs(); + device->enqueueJob("updateself_kernel",hb_mc_dimension(X,Y), {frontier.getAddr(), edges.num_nodes(), tag_c}); + device->runJobs(); tag_c++; std::cerr << "run update edges kernel on iter : " << iter << "\n"; - device->enqueuejob("edgeset_apply_push_parallel_from_vertexset_call", 
hb_mc_dimension(x,y),{edges.getoutindicesaddr() , edges.getoutneighborsaddr(), frontier.getaddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); - device->runjobs(); + device->enqueueJob("edgeset_apply_push_parallel_from_vertexset_call", hb_mc_dimension(X,Y),{edges.getOutIndicesAddr() , edges.getOutNeighborsAddr(), frontier.getAddr(), edges.num_nodes(), edges.num_edges(), edges.num_nodes(), tag_c}); + device->runJobs(); tag_c++; std::cerr << "create next frontier\n"; - device->enqueuejob("filter_frontier_where_call", hb_mc_dimension(x,y),{frontier.getaddr(), edges.num_nodes(), edges.num_edges(), tag_c}); - device->runjobs(); + device->enqueueJob("filter_frontier_where_call", hb_mc_dimension(X,Y),{frontier.getAddr(), edges.num_nodes(), edges.num_edges(), tag_c}); + device->runJobs(); std::cerr << "swap arrays\n"; hammerblade::swap_global_arrays(new_rank_dev, old_rank_dev); - f_sz = builtin_getvertexsetsizehb(frontier, edges.num_nodes()); + f_sz = builtin_getVertexSetSizeHB(frontier, edges.num_nodes()); std::cerr << "size of frontier after iteration " << iter << " : " << f_sz << std::endl; break; } - if(verify) { + if(VERIFY) { ofstream ver_file; ver_file.open("./rank.txt"); float host_rank[edges.num_nodes()];