Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Map Exchange to set up communication for MDLCSR #22

Merged
merged 8 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ jobs:
echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile.dev)" >> $GITHUB_ENV
if [ ${{ matrix.build-type }} == 'LSAN' ]; then
echo "PANDO_BUILD_DOCS=OFF" >> $GITHUB_ENV
echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=16384 -ePANDO_PREP_MAIN_NODE=8589934592" >> $GITHUB_ENV
echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=16384" >> $GITHUB_ENV
fi
if [ ${{ matrix.build-type }} == 'UBSAN' ]; then
echo "PANDO_BUILD_DOCS=OFF" >> $GITHUB_ENV
echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=16384 -ePANDO_PREP_MAIN_NODE=8589934592" >> $GITHUB_ENV
echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=16384" >> $GITHUB_ENV
fi
if [ ${{ matrix.build-type }} == 'Release' ]; then
echo "PANDO_BUILD_DOCS=OFF" >> $GITHUB_ENV
Expand Down
8 changes: 4 additions & 4 deletions cmake/PANDOTesting.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ function(pando_add_driver_test_lib TARGET SOURCEFILE LIBRARY)
else ()
set(HTHREADS "")
if (${GASNet_CONDUIT} STREQUAL "smp")
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/pando-rt/scripts/preprun.sh)
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/scripts/preprun.sh)
elseif (${GASNet_CONDUIT} STREQUAL "mpi")
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/pando-rt/scripts/preprun_mpi.sh)
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/scripts/preprun_mpi.sh)
else ()
message(FATAL_ERROR "No runner script for GASNet conduit ${GASNet_CONDUIT}")
endif ()
Expand Down Expand Up @@ -124,9 +124,9 @@ endfunction()
function(pando_add_bin_test TARGET ARGS INPUTFILE OKFILE)
if (NOT PANDO_RT_BACKEND STREQUAL "DRVX")
if (${GASNet_CONDUIT} STREQUAL "smp")
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/pando-rt/scripts/preprun.sh)
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/scripts/preprun.sh)
elseif (${GASNet_CONDUIT} STREQUAL "mpi")
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/pando-rt/scripts/preprun_mpi.sh)
set(DRIVER_SCRIPT ${pando-lib-galois_SOURCE_DIR}/scripts/preprun_mpi.sh)
else ()
message(FATAL_ERROR "No runner script for GASNet conduit ${GASNet_CONDUIT}")
endif ()
Expand Down
10 changes: 7 additions & 3 deletions include/pando-lib-galois/graphs/dist_local_csr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,14 @@ class DistLocalCSR {
}

/** Host Information **/
std::uint64_t getPhysicalHostID(VertexTokenID tid) {
std::uint64_t getVirtualHostID(VertexTokenID tid) {
std::uint64_t virtualHostID = tid % this->numVHosts();
std::uint64_t physicalHost = fmap(virtualToPhysicalMap.getLocalRef(), get, virtualHostID);
return physicalHost;
return virtualHostID;
}
std::uint64_t getPhysicalHostID(VertexTokenID tid) {
std::uint64_t virtualHostID = this->getVirtualHostID(tid);
std::uint64_t physicalHostID = fmap(virtualToPhysicalMap.getLocalRef(), get, virtualHostID);
return physicalHostID;
}

/** Topology Modifications **/
Expand Down
115 changes: 66 additions & 49 deletions include/pando-lib-galois/graphs/mirror_dist_local_csr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

#include <utility>

#include "pando-rt/sync/mutex.hpp"
#include <pando-lib-galois/containers/hashtable.hpp>
#include <pando-lib-galois/containers/host_indexed_map.hpp>
#include <pando-lib-galois/containers/host_local_storage.hpp>
Expand All @@ -17,7 +16,10 @@
#include <pando-lib-galois/graphs/local_csr.hpp>
#include <pando-lib-galois/import/wmd_graph_importer.hpp>
#include <pando-lib-galois/loops/do_all.hpp>
#include <pando-lib-galois/sync/global_barrier.hpp>
#include <pando-lib-galois/sync/simple_lock.hpp>
#include <pando-lib-galois/utility/gptr_monad.hpp>
#include <pando-lib-galois/utility/tuple.hpp>
#include <pando-rt/containers/array.hpp>
#include <pando-rt/containers/vector.hpp>
#include <pando-rt/memory/memory_guard.hpp>
Expand Down Expand Up @@ -132,6 +134,10 @@ class MirrorDistLocalCSR {
VertexTopologyID getMaster() {
return master;
}

bool operator==(const MirrorToMasterMap& a) noexcept {
return a.mirror == mirror && a.master == master;
}
};

/** Vertex Manipulation **/
Expand Down Expand Up @@ -239,19 +245,13 @@ class MirrorDistLocalCSR {
}

/** Host Information **/
std::uint64_t getVirtualHostID(VertexTokenID tid) {
return dlcsr.getVirtualHostID(tid);
}
std::uint64_t getPhysicalHostID(VertexTokenID tid) {
return dlcsr.getPhysicalHostID(tid);
}

/** Sync **/
// TODO(Ying-Wei):
// write a sync function that reduces mirror values and then broadcasts master values
// return a bitmap of modified vertices
//
// template <typename Func>
// pando::Array<bool> sync(Func func, pando::Array<bool>) {
//}

/**
* @brief get vertex local dense ID
*/
Expand Down Expand Up @@ -303,8 +303,6 @@ class MirrorDistLocalCSR {
return dlcsr.getLocalCSR();
}

// TODO(Jeageun):
// write a initialize function that calls initializeAfterGather function of DistLocalCSR dlcsr
template <typename ReadVertexType, typename ReadEdgeType>
pando::Status initializeAfterGather(
galois::HostLocalStorage<pando::Vector<ReadVertexType>> vertexData, std::uint64_t numVertices,
Expand Down Expand Up @@ -390,62 +388,81 @@ class MirrorDistLocalCSR {
numVertices += lift(mirrorList[i], size);
}
PANDO_CHECK(wg.wait());

PANDO_CHECK_RETURN(setupCommunication());

return pando::Status::Success;
}
ywwu928 marked this conversation as resolved.
Show resolved Hide resolved

// TODO(Ying-Wei):
// uses doAll to send remoteMasterToLocalMirrorMap to corresponding remote hosts
// no need to use executeON
// just push to the localMasterToRemoteMirrorOrderedTable vector
// make sure to use the spin lock in pando-rt
/**
* @brief Get the local mutex
* @brief Exchanges the mirror to master mapping from the mirror side to the master side
*/
pando::GlobalRef<pando::Mutex> getLocalMutex(std::uint64_t host_id) {
return hostMutex[host_id];
}

pando::Status setupCommunication() {
auto dims = pando::getPlaceDims();

// initialize localMirrorToRemoteMasterOrderedTable
PANDO_CHECK_RETURN(localMasterToRemoteMirrorTable.initialize());
for (std::int16_t i = 0; i < dims.node.id; i++) {
pando::GlobalRef<pando::Vector<pando::Vector<MirrorToMasterMap>>>
localMasterToRemoteMirrorMap = localMasterToRemoteMirrorTable[i];
PANDO_CHECK_RETURN(fmap(localMasterToRemoteMirrorMap, initialize, dims.node.id));
for (std::int16_t i = 0; i < dims.node.id; i++) {
pando::GlobalRef<pando::Vector<MirrorToMasterMap>> mapVectorFromHost =
fmap(localMasterToRemoteMirrorMap, get, i);
PANDO_CHECK_RETURN(fmap(mapVectorFromHost, initialize, 0));
}
}

PANDO_CHECK_RETURN(hostMutex.initialize());

PANDO_CHECK_RETURN(galois::doAll(
localMirrorToRemoteMasterOrderedTable, localMasterToRemoteMirrorTable,
+[](galois::HostLocalStorage<pando::Array<MirrorToMasterMap>>
localMirrorToRemoteMasterOrderedTable,
pando::GlobalRef<pando::Vector<EdgeHandle>> localMasterToRemoteMirrorTable) {
PANDO_CHECK(fmap(localMirrorToRemoteMasterOrderedTable, initialize, 0));
pando::Array<MirrorToMasterMap> remoteMasterToLocalMirrorMap =
localMirrorToRemoteMasterOrderedTable.getLocal();
for (MirrorToMasterMap m : remoteMasterToLocalMirrorMap) {
VertexTopologyID masterTopologyID = m.master;
VertexTokenID masterTokenID = getTokenID(masterTopologyID);
std::uint64_t physicalHost = getPhysicalHostID(masterTokenID);
pando::Mutex mutex = getLocalMutex(physicalHost);

// Lock mutex to ensure atomic append to the vector
mutex.lock();
PANDO_CHECK(fmap(localMasterToRemoteMirrorTable, pushBack, m));
mutex.unlock();
auto thisCSR = *this;
auto state = galois::make_tpl(thisCSR, localMasterToRemoteMirrorTable);

// push style
// each host traverses its own localMirrorToRemoteMasterOrderedTable and send out the mapping to
// the corresponding remote host append to the vector of vector where each vector is the mapping
// from a specific host
galois::doAll(
state, localMirrorToRemoteMasterOrderedTable,
+[](decltype(state) state,
pando::GlobalRef<pando::Array<MirrorToMasterMap>> localMirrorToRemoteMasterOrderedMap) {
auto [object, localMasterToRemoteMirrorTable] = state;
for (std::uint64_t i = 0ul; i < lift(localMirrorToRemoteMasterOrderedMap, size); i++) {
MirrorToMasterMap m = fmap(localMirrorToRemoteMasterOrderedMap, get, i);
VertexTopologyID masterTopologyID = m.getMaster();
VertexTokenID masterTokenID = object.getTokenID(masterTopologyID);
std::uint64_t physicalHost = object.getPhysicalHostID(masterTokenID);

pando::GlobalRef<pando::Vector<pando::Vector<MirrorToMasterMap>>>
localMasterToRemoteMirrorMap = localMasterToRemoteMirrorTable[physicalHost];
pando::GlobalRef<pando::Vector<MirrorToMasterMap>> mapVectorFromHost =
fmap(localMasterToRemoteMirrorMap, get, pando::getCurrentPlace().node.id);

PANDO_CHECK(fmap(mapVectorFromHost, pushBack, m));
}
}));
});

return pando::Status::Success;
}

/**
* @brief For testing only
*/
pando::GlobalRef<pando::Array<MirrorToMasterMap>> getLocalMirrorToRemoteMasterOrderedMap(
int16_t hostId) {
return localMirrorToRemoteMasterOrderedTable[hostId];
}
pando::GlobalRef<pando::Vector<pando::Vector<MirrorToMasterMap>>> getLocalMasterToRemoteMirrorMap(
uint64_t hostId) {
return localMasterToRemoteMirrorTable[hostId];
}

private:
DLCSR dlcsr;
uint64_t _mirror_size;
galois::HostLocalStorage<LocalVertexRange> masterRange;
galois::HostLocalStorage<LocalVertexRange> mirrorRange;
galois::HostLocalStorage<pando::Array<MirrorToMasterMap>> localMirrorToRemoteMasterOrderedTable;

// TODO(Ying-Wei):
// generate the following
galois::HostLocalStorage<pando::Mutex> hostMutex;
galois::HostLocalStorage<pando::Vector<EdgeHandle>> localMasterToRemoteMirrorTable;
// galois::GlobalBarrier barrier;
galois::HostLocalStorage<pando::Vector<pando::Vector<MirrorToMasterMap>>>
localMasterToRemoteMirrorTable;
};

static_assert(graph_checker<MirrorDistLocalCSR<std::uint64_t, std::uint64_t>>::value);
Expand Down
4 changes: 2 additions & 2 deletions pando-rt/cmake/PANDOTesting.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ function(pando_add_driver_test TARGET SOURCEFILE)

if (PANDO_RT_BACKEND STREQUAL "PREP")
if (${GASNet_CONDUIT} STREQUAL "smp")
set(RUNNER_SCRIPT ${PROJECT_SOURCE_DIR}/scripts/preprun.sh)
set(RUNNER_SCRIPT ${PROJECT_SOURCE_DIR}/../scripts/preprun.sh)
elseif (${GASNet_CONDUIT} STREQUAL "mpi")
set(RUNNER_SCRIPT ${PROJECT_SOURCE_DIR}/scripts/preprun_mpi.sh)
set(RUNNER_SCRIPT ${PROJECT_SOURCE_DIR}/../scripts/preprun_mpi.sh)
else ()
message(FATAL_ERROR "No runner script for GASNet conduit ${GASNet_CONDUIT}")
endif ()
Expand Down
36 changes: 19 additions & 17 deletions pando-rt/scripts/preprun.sh → scripts/preprun.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2023. University of Texas at Austin. All rights reserved.

#
# SPDX-License-Identifier: MIT
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Expand Down Expand Up @@ -29,32 +32,31 @@ usage: preprun -n <n> prog [program args]

harts=""

while getopts "n:c:t:h" option
do
while getopts "n:c:t:h" option; do
case ${option} in
n) # number of emulated PXNs
nodes=${OPTARG}
;;
c) # number of emulated cores per PXN
cores=${OPTARG}
;;
t) # number of emulated cores per PXN
harts=${OPTARG}
;;
h) # help
show_help
exit
;;
n) # number of emulated PXNs
nodes=${OPTARG}
;;
c) # number of emulated cores per PXN
cores=${OPTARG}
;;
t) # number of emulated harts
harts=${OPTARG}
;;
h) # help
show_help
exit
;;
esac
done
shift $(expr $OPTIND - 1 )
shift $(expr $OPTIND - 1)
prog=$@

export GASNET_PSHM_NODES=$nodes
export PANDO_PREP_NUM_CORES=$cores

if [ -n "$harts" ]; then
export PANDO_PREP_NUM_HARTS=$harts
export PANDO_PREP_NUM_HARTS=$harts
fi

exec $prog
40 changes: 23 additions & 17 deletions pando-rt/scripts/preprun_mpi.sh → scripts/preprun_mpi.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2023. University of Texas at Austin. All rights reserved.

#
# SPDX-License-Identifier: MIT
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Expand Down Expand Up @@ -28,32 +31,35 @@ usage: preprun -n <n> prog [program args]
#

harts=""
# 8 GiB (8589934592 bytes) main memory size by default
main_memory_size="${main_memory_size-8589934592}"

while getopts "n:c:t:h" option
do
while getopts "n:c:t:h" option; do
case ${option} in
n) # number of emulated PXNs
nodes=${OPTARG}
;;
c) # number of emulated cores per PXN
cores=${OPTARG}
;;
t) # number of emulated cores per PXN
harts=${OPTARG}
;;
h) # help
show_help
exit
;;
n) # number of emulated PXNs
nodes=${OPTARG}
;;
c) # number of emulated cores per PXN
cores=${OPTARG}
;;
t) # number of emulated harts
harts=${OPTARG}
;;
h) # help
show_help
exit
;;
esac
done
shift $(expr $OPTIND - 1 )
shift $(expr $OPTIND - 1)
prog=$@

export PANDO_PREP_NUM_CORES=$cores

if [ -n "$harts" ]; then
export PANDO_PREP_NUM_HARTS=$harts
export PANDO_PREP_NUM_HARTS=$harts
fi

export PANDO_PREP_MAIN_NODE=$main_memory_size

gasnetrun_mpi -n $nodes $prog
Loading
Loading