Skip to content

Commit

Permalink
BFS optimization and memory access count script update (#73)
Browse files Browse the repository at this point in the history
* more useful access count script

* add kernel print for DLCSR

* print out number of mirrors

* BFS optimization to bypass broadcast during sync

* clean up remote accesses in reduce and broadcast

* print out location of source vertex

* minor fix

* change cout to cerr
  • Loading branch information
ywwu928 authored Apr 19, 2024
1 parent 618bb20 commit 31e9246
Show file tree
Hide file tree
Showing 7 changed files with 233 additions and 130 deletions.
115 changes: 61 additions & 54 deletions include/pando-lib-galois/graphs/mirror_dist_local_csr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include <pando-lib-galois/graphs/local_csr.hpp>
#include <pando-lib-galois/import/wmd_graph_importer.hpp>
#include <pando-lib-galois/loops/do_all.hpp>
#include <pando-lib-galois/sync/global_barrier.hpp>
#include <pando-lib-galois/sync/simple_lock.hpp>
#include <pando-lib-galois/utility/gptr_monad.hpp>
#include <pando-lib-galois/utility/tuple.hpp>
Expand Down Expand Up @@ -570,50 +569,49 @@ class MirrorDistLocalCSR {
WaitGroup wg;
PANDO_CHECK(wg.initialize(0));
WaitGroup::HandleType wgh = wg.getHandle();
auto thisCSR = *this;
auto state =
galois::make_tpl(thisCSR, localMirrorToRemoteMasterOrderedTable, masterBitSets, func, wgh);
auto thisMDLCSR = *this;
auto state = galois::make_tpl(thisMDLCSR, func, wgh);

PANDO_CHECK(galois::doAll(
wgh, state, mirrorBitSets,
+[](decltype(state) state, pando::GlobalRef<pando::Array<bool>> mirrorBitSet) {
auto [graph, localMirrorToRemoteMasterOrderedTable, masterBitSets, func, wgh] = state;
wgh, state, localMirrorToRemoteMasterOrderedTable,
+[](decltype(state) state,
pando::GlobalRef<pando::Array<MirrorToMasterMap>> localMirrorToRemoteMasterOrderedMap) {
auto [thisMDLCSR, func, wgh] = state;

pando::GlobalRef<pando::Array<MirrorToMasterMap>> localMirrorToRemoteMasterOrderedMap =
localMirrorToRemoteMasterOrderedTable[pando::getCurrentPlace().node.id];
pando::GlobalRef<pando::Array<bool>> mirrorBitSet = thisMDLCSR.getLocalMirrorBitSet();

for (std::uint64_t i = 0ul; i < lift(mirrorBitSet, size); i++) {
bool dirty = fmap(mirrorBitSet, operator[], i);
if (dirty) {
// obtain the local mirror vertex data
VertexTopologyID mirrorTopologyID = graph.getMirrorTopologyIDFromIndex(i);
VertexTopologyID mirrorTopologyID = thisMDLCSR.getMirrorTopologyIDFromIndex(i);
// a copy
VertexData mirrorData = graph.getData(mirrorTopologyID);
VertexData mirrorData = thisMDLCSR.getData(mirrorTopologyID);

// obtain the corresponding remote master information
MirrorToMasterMap mirrorToMasterMap =
fmap(localMirrorToRemoteMasterOrderedMap, operator[], i);
VertexTopologyID masterTopologyID = mirrorToMasterMap.getMaster();
MirrorToMasterMap map = fmap(localMirrorToRemoteMasterOrderedMap, operator[], i);
VertexTopologyID masterTopologyID = map.getMaster();
// atomic function signature: func(VertexData mirror, pando::GlobalRef<VertexData> master)
// apply the function
wgh.addOne();
PANDO_CHECK(executeOn(
localityOf(masterTopologyID),
+[](Func func, decltype(graph) graph, VertexData mirrorData,
VertexTopologyID masterID, WaitGroup::HandleType wgh) {
auto masterData = graph.getData(masterID);
+[](Func func, decltype(thisMDLCSR) thisMDLCSR, VertexTopologyID masterTopologyID,
VertexData mirrorData, WaitGroup::HandleType wgh) {
pando::GlobalRef<VertexData> masterData = thisMDLCSR.getData(masterTopologyID);
VertexData oldMasterData = masterData;
func(mirrorData, masterData);
if (masterData != oldMasterData) {
// set the remote master bit set
pando::GlobalRef<pando::Array<bool>> masterBitSet =
graph.masterBitSets.getLocalRef();
std::uint64_t index = graph.getIndex(masterID, graph.getLocalMasterRange());
thisMDLCSR.getLocalMasterBitSet();
std::uint64_t index =
thisMDLCSR.getIndex(masterTopologyID, thisMDLCSR.getLocalMasterRange());
fmap(masterBitSet, operator[], index) = true;
}
wgh.done();
},
func, graph, mirrorData, masterTopologyID, wgh));
func, thisMDLCSR, masterTopologyID, mirrorData, wgh));
// master data updated
}
}
Expand All @@ -625,65 +623,74 @@ class MirrorDistLocalCSR {
* @brief Broadcast the updated master values to the corresponding mirror values
*/
void broadcast() {
galois::GlobalBarrier barrier;
PANDO_CHECK(barrier.initialize(pando::getPlaceDims().node.id));
WaitGroup wg;
PANDO_CHECK(wg.initialize(0));
WaitGroup::HandleType wgh = wg.getHandle();

auto thisCSR = *this;
auto state = galois::make_tpl(thisCSR, barrier, masterBitSets, mirrorBitSets);
auto thisMDLCSR = *this;
auto state = galois::make_tpl(thisMDLCSR, wgh);

galois::doAll(
state, localMasterToRemoteMirrorTable,
PANDO_CHECK(galois::doAll(
wgh, state, localMasterToRemoteMirrorTable,
+[](decltype(state) state, pando::GlobalRef<pando::Vector<pando::Vector<MirrorToMasterMap>>>
localMasterToRemoteMirrorMap) {
auto [object, barrier, masterBitSets, mirrorBitSets] = state;
auto [thisMDLCSR, wgh] = state;

pando::GlobalRef<pando::Array<bool>> masterBitSet =
masterBitSets[pando::getCurrentPlace().node.id];
pando::GlobalRef<pando::Array<bool>> masterBitSet = thisMDLCSR.getLocalMasterBitSet();

std::uint64_t numHosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);

for (std::uint64_t nodeId = 0ul; nodeId < numHosts; nodeId++) {
pando::GlobalRef<pando::Vector<MirrorToMasterMap>> mapVectorFromHost =
fmap(localMasterToRemoteMirrorMap, operator[], nodeId);
for (std::uint64_t i = 0ul; i < lift(mapVectorFromHost, size); i++) {
MirrorToMasterMap m = fmap(mapVectorFromHost, operator[], i);
VertexTopologyID masterTopologyID = m.getMaster();
std::uint64_t index = object.getIndex(masterTopologyID, object.getLocalMasterRange());
MirrorToMasterMap map = fmap(mapVectorFromHost, operator[], i);
VertexTopologyID masterTopologyID = map.getMaster();
std::uint64_t index =
thisMDLCSR.getIndex(masterTopologyID, thisMDLCSR.getLocalMasterRange());
bool dirty = fmap(masterBitSet, operator[], index);
if (dirty) {
// obtain the local master vertex data
VertexData masterData = object.getData(masterTopologyID);
VertexData masterData = thisMDLCSR.getData(masterTopologyID);

// obtain the corresponding remote mirror information
VertexTopologyID mirrorTopologyID = m.getMirror();
// actual reference
pando::GlobalRef<VertexData> mirrorData = object.getData(mirrorTopologyID);

VertexData oldMirrorData = mirrorData;
mirrorData = masterData;
// mirror data updated
if (mirrorData != oldMirrorData) {
// set the remote mirror bit set
pando::GlobalRef<pando::Array<bool>> mirrorBitSet = mirrorBitSets[nodeId];
std::uint64_t index =
object.getIndex(mirrorTopologyID, object.getMirrorRange(nodeId));
fmap(mirrorBitSet, operator[], index) = true;
}
VertexTopologyID mirrorTopologyID = map.getMirror();
wgh.addOne();
PANDO_CHECK(executeOn(
localityOf(mirrorTopologyID),
+[](decltype(thisMDLCSR) thisMDLCSR, VertexTopologyID mirrorTopologyID,
VertexData masterData, WaitGroup::HandleType wgh) {
pando::GlobalRef<VertexData> mirrorData =
thisMDLCSR.getData(mirrorTopologyID);
VertexData oldMirrorData = mirrorData;
mirrorData = masterData;
if (mirrorData != oldMirrorData) {
// set the remote mirror bit set
pando::GlobalRef<pando::Array<bool>> mirrorBitSet =
thisMDLCSR.getLocalMirrorBitSet();
std::uint64_t index =
thisMDLCSR.getIndex(mirrorTopologyID, thisMDLCSR.getLocalMirrorRange());
fmap(mirrorBitSet, operator[], index) = true;
}
wgh.done();
},
thisMDLCSR, mirrorTopologyID, masterData, wgh));
}
}
}

barrier.done();
PANDO_CHECK(barrier.wait());
});
}));
PANDO_CHECK(wg.wait());
wg.deinitialize();
}
/**
* @brief Synchronize master and mirror values among hosts
*/
template <typename Func, bool NOBROADCAST = false>
template <typename Func, bool REDUCE = true, bool BROADCAST = true>
void sync(Func func) {
reduce(func);
if (!NOBROADCAST) {
if (REDUCE) {
reduce(func);
}
if (BROADCAST) {
broadcast();
}
}
Expand Down
50 changes: 31 additions & 19 deletions microbench/bfs/include/pando-bfs-galois/sssp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ pando::Status SSSP_DLCSR(
G& graph, std::uint64_t src, ThreadLocalVector<typename G::VertexTopologyID>& active,
galois::HostLocalStorage<pando::Vector<typename G::VertexTopologyID>>& phbfs) {
#ifdef DPRINTS
std::cout << "Got into SSSP" << std::endl;
std::cerr << "Got into SSSP" << std::endl;
#endif

galois::WaitGroup wg{};
Expand All @@ -132,7 +132,13 @@ pando::Status SSSP_DLCSR(
state.active = active;
state.dist = 0;

PANDO_MEM_STAT_NEW_KERNEL("BFS Start");
#ifdef PANDO_STAT_TRACE_ENABLE
PANDO_CHECK(galois::doAll(
wgh, phbfs, +[](pando::Vector<typename G::VertexTopologyID>) {
PANDO_MEM_STAT_NEW_KERNEL("BFS Start");
}));
PANDO_CHECK(wg.wait());
#endif

while (!IsactiveIterationEmpty(phbfs)) {
#ifdef DPRINTS
Expand All @@ -156,7 +162,13 @@ pando::Status SSSP_DLCSR(
#endif
}

PANDO_MEM_STAT_NEW_KERNEL("BFS End");
#ifdef PANDO_STAT_TRACE_ENABLE
PANDO_CHECK(galois::doAll(
wgh, phbfs, +[](pando::Vector<typename G::VertexTopologyID>) {
PANDO_MEM_STAT_NEW_KERNEL("BFS END");
}));
PANDO_CHECK(wg.wait());
#endif

if constexpr (COUNT_EDGE) {
galois::doAll(
Expand Down Expand Up @@ -245,29 +257,22 @@ pando::Status MDLCSRLocal(G& graph, MDWorkList<G> toRead, MDWorkList<G> toWrite)
}

template <typename G>
bool updateActive(G& graph, MDWorkList<G> toRead, const pando::Array<bool>& masterBitSet,
const pando::Array<bool>& mirrorBitSet) {
bool updateActive(G& graph, MDWorkList<G> toRead, const pando::Array<bool>& masterBitSet) {
bool active = false;
for (std::uint64_t i = 0; i < masterBitSet.size(); i++) {
if (masterBitSet[i]) {
active = true;
PANDO_CHECK(fmap(toRead[0], pushBack, graph.getMasterTopologyIDFromIndex(i)));
}
}
for (std::uint64_t i = 0; i < mirrorBitSet.size(); i++) {
if (mirrorBitSet[i]) {
active = true;
PANDO_CHECK(fmap(toRead[0], pushBack, graph.getMirrorTopologyIDFromIndex(i)));
}
}
return active;
}

template <typename G>
pando::Status SSSPMDLCSR(G& graph, std::uint64_t src, HostLocalStorage<MDWorkList<G>>& toRead,
HostLocalStorage<MDWorkList<G>>& toWrite, P<bool> active) {
#ifdef DPRINTS
std::cout << "Got into SSSP" << std::endl;
std::cerr << "Got into SSSP" << std::endl;
#endif
galois::WaitGroup wg{};
PANDO_CHECK_RETURN(wg.initialize(0));
Expand All @@ -285,11 +290,20 @@ pando::Status SSSPMDLCSR(G& graph, std::uint64_t src, HostLocalStorage<MDWorkLis
auto [srcID, found] = graph.getLocalTopologyID(src);
if (found) {
graph.setDataOnly(srcID, 0);
PANDO_CHECK(fmap(toRead[0], pushBack, srcID));

std::int16_t srcHost = graph.getPhysicalHostID(src);
if (srcHost == pando::getCurrentPlace().node.id) {
PANDO_CHECK(fmap(toRead[0], pushBack, srcID));
}
}
}));
PANDO_CHECK_RETURN(wg.wait());

#ifdef DPRINTS
std::uint64_t srcHost = graph.getPhysicalHostID(src);
std::cerr << "Source is on host " << srcHost << std::endl;
#endif

#ifdef PANDO_STAT_TRACE_ENABLE
PANDO_CHECK(galois::doAll(
wgh, toRead, +[](MDWorkList<G>) {
Expand All @@ -313,17 +327,15 @@ pando::Status SSSPMDLCSR(G& graph, std::uint64_t src, HostLocalStorage<MDWorkLis
}));
PANDO_CHECK_RETURN(wg.wait());

graph.template sync<decltype(updateData), true>(updateData);
graph.template sync<decltype(updateData), true, false>(updateData);

galois::HostLocalStorage<pando::Array<bool>> masterBitSets = graph.getMasterBitSets();
galois::HostLocalStorage<pando::Array<bool>> mirrorBitSets = graph.getMirrorBitSets();
auto activeState = galois::make_tpl(graph, mirrorBitSets, toRead, active);
auto activeState = galois::make_tpl(graph, toRead, active);
PANDO_CHECK_RETURN(galois::doAll(
wgh, activeState, masterBitSets,
+[](decltype(activeState) activeState, pando::Array<bool> masterBitSet) {
auto [graph, mirrorBitSets, toRead, active] = activeState;
pando::Array<bool> mirrorBitSet = mirrorBitSets.getLocalRef();
if (updateActive(graph, toRead.getLocalRef(), masterBitSet, mirrorBitSet)) {
auto [graph, toRead, active] = activeState;
if (updateActive(graph, toRead.getLocalRef(), masterBitSet)) {
*active = true;
}
}));
Expand Down
9 changes: 9 additions & 0 deletions microbench/bfs/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ void HBMainMDLCSR(pando::Vector<std::uint64_t> srcVertices, std::uint64_t numVer
PANDO_CHECK(toRead.initialize());
PANDO_CHECK(toWrite.initialize());

#ifdef DPRINTS
// print out number of mirrors
PANDO_CHECK(galois::doAll(
graph, toRead, +[](Graph graph, bfs::MDWorkList<Graph>) {
std::cerr << "Host " << pando::getCurrentPlace().node.id << " has " << graph.getMirrorSize()
<< " mirrors" << std::endl;
}));
#endif

PANDO_CHECK(galois::doAll(
toRead, toWrite,
+[](decltype(toRead) toRead, pando::GlobalRef<bfs::MDWorkList<Graph>> toWriteLocal) {
Expand Down
Loading

0 comments on commit 31e9246

Please sign in to comment.