Commit
Fix hang in tc microbenchmark + fix overlapping prep timers (supersede #127 + #129) (#137)

* Fix hang in TC microbenchmark

Added a shared WaitGroup between the outer vertex doAll and the nested,
per-vertex edge doAll (see the sketch after this list). The original code
would hang because of its separate wait groups: after enqueueing a doAll in
edge_tc_counting, harts wait for it to complete (tc_algos.cpp:42). However,
this happens on every hart because of the outer doAll in tc_no_chunk, so
every hart is waiting and none is available to complete the work being
waited on. With one combined wait group, the outer doAll tasks can complete
after enqueuing, but before completion of, the inner doAll tasks, which
frees harts to run the inner doAll and make forward progress.

* fix overlapping prep timers

* add synchronization to drv timers
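For reference, a minimal standalone sketch of the deadlock and the fix. This is an illustration, not the galois/pando API: harts are modeled as a tiny thread pool over one shared task queue, the WaitGroup as an atomic counter, and enqueue plus the 4-vertex/8-edge sizes are invented for the demo.

// Sketch only: models the hang described above with plain C++ threads.
// "Harts" are pool threads; the shared WaitGroup is an atomic counter.
#include <atomic>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

std::mutex m;
std::queue<std::function<void()>> tasks; // shared queue that "doAll" enqueues into

void enqueue(std::function<void()> f) {
  std::lock_guard<std::mutex> lk(m);
  tasks.push(std::move(f));
}

int main() {
  std::atomic<long> wg{0}; // ONE combined wait group (the fix)

  // Outer "vertex" doAll: each task fans out inner "edge" tasks against the
  // same wait group and returns WITHOUT blocking. The buggy version instead
  // gave each outer task a private wait group and blocked on it here; with
  // every hart stuck in that wait, no hart was left to run the inner tasks.
  for (int v = 0; v < 4; ++v) {
    wg.fetch_add(1); // like wgh.addOne() for the outer task
    enqueue([&wg] {
      for (int e = 0; e < 8; ++e) {
        wg.fetch_add(1); // addOne() per inner task
        enqueue([&wg] { wg.fetch_sub(1); }); // inner task ends with wgh.done()
      }
      wg.fetch_sub(1); // outer task done; inner tasks may still be pending
    });
  }

  // Two "harts" drain the queue; a hart retires when it sees the queue empty.
  std::vector<std::thread> harts;
  for (int h = 0; h < 2; ++h) {
    harts.emplace_back([] {
      for (;;) {
        std::function<void()> f;
        {
          std::lock_guard<std::mutex> lk(m);
          if (tasks.empty()) return;
          f = std::move(tasks.front());
          tasks.pop();
        }
        f(); // may enqueue more work; a task never blocks on other tasks
      }
    });
  }
  for (auto& t : harts) t.join();
  return wg.load() == 0 ? 0 : 1; // like PANDO_CHECK(wg.wait())
}

The same reasoning explains why the fix hoists wg.wait() up into tc_no_chunk and tc_chunk_vertices: only the single top-level caller blocks, so the remaining harts can drain both levels of tasks.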
elgarten authored Aug 12, 2024
1 parent c869479 commit 30140ba
Showing 3 changed files with 74 additions and 51 deletions.
microbench/triangle-counting/include/utils.hpp (3 changes: 1 addition & 2 deletions)
@@ -63,7 +63,7 @@ void printUsage(char* argv0);
 // CONNECTION KERNELS
 // #####################################################################
 template <typename GraphType>
-void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr<GraphType> graph_ptr,
+void intersect_dag_merge(pando::GlobalPtr<GraphType> graph_ptr,
                          typename GraphType::VertexTopologyID v0,
                          typename GraphType::VertexTopologyID v1,
                          galois::DAccumulator<uint64_t> final_tri_count) {
@@ -90,7 +90,6 @@ void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr<Gra
     count++;
   }
   final_tri_count.add(count);
-  wgh.done();
 }

 template <typename GraphType>
microbench/triangle-counting/src/tc_algos.cpp (38 changes: 23 additions & 15 deletions)
@@ -14,12 +14,9 @@
  * @param[in] final_tri_count Thread-safe counter
  */
 template <typename Graph>
-void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexTopologyID v0,
-                      typename Graph::EdgeRange edge_range,
+void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr<Graph> graph_ptr,
+                      typename Graph::VertexTopologyID v0, typename Graph::EdgeRange edge_range,
                       galois::DAccumulator<uint64_t> final_tri_count) {
-  galois::WaitGroup wg;
-  PANDO_CHECK(wg.initialize(0));
-  auto wgh = wg.getHandle();
   auto innerState = galois::make_tpl(graph_ptr, v0, wgh, final_tri_count);
   Graph graph = *graph_ptr;
   galois::doAll(
@@ -28,8 +25,7 @@ void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexT
         auto [graph_ptr, v0, wgh, final_tri_count] = innerState;
         Graph g = *graph_ptr;
         typename Graph::VertexTopologyID v1 = fmap(g, getEdgeDst, eh);
-        wgh.addOne();
-        intersect_dag_merge<Graph>(wgh, graph_ptr, v0, v1, final_tri_count);
+        intersect_dag_merge<Graph>(graph_ptr, v0, v1, final_tri_count);
       },
       [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place {
         auto v0 = std::get<1>(innerState);
@@ -39,7 +35,6 @@ void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexT
                            : fmap(graph, getLocalityVertex, v1);
         return locality;
       });
-  PANDO_CHECK(wg.wait());
 }

 // #####################################################################
@@ -55,19 +50,27 @@ template <typename GraphType>
 void tc_no_chunk(pando::GlobalPtr<GraphType> graph_ptr,
                  galois::DAccumulator<uint64_t> final_tri_count) {
   GraphType graph = *graph_ptr;
-  auto state = galois::make_tpl(graph_ptr, final_tri_count);
+
+  galois::WaitGroup wg;
+  PANDO_CHECK(wg.initialize(0));
+  auto wgh = wg.getHandle();
+  auto state = galois::make_tpl(graph_ptr, final_tri_count, wgh);

   galois::doAll(
-      state, graph.vertices(), +[](decltype(state) state, typename GraphType::VertexTopologyID v0) {
-        auto [graph_ptr, final_tri_count] = state;
+      wgh, state, graph.vertices(),
+      +[](decltype(state) state, typename GraphType::VertexTopologyID v0) {
+        auto [graph_ptr, final_tri_count, wgh] = state;
         GraphType graph = *graph_ptr;

         // Degree Filtering Optimization
         uint64_t v0_degree = graph.getNumEdges(v0);
         if (v0_degree < (TC_EMBEDDING_SZ - 1))
           return;

-        edge_tc_counting<GraphType>(graph_ptr, v0, graph.edges(v0), final_tri_count);
+        edge_tc_counting<GraphType>(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count);
       });
+  PANDO_CHECK(wg.wait());
+  wg.deinitialize();
 }

 /**
@@ -159,27 +162,32 @@ void tc_chunk_vertices(pando::GlobalPtr<GraphDL> graph_ptr,
         auto lcsr = graph.getLocalCSR();
         uint64_t host_vertex_iter_offset = host_vertex_iter_offset_ref;

-        auto inner_state = galois::make_tpl(graph_ptr, final_tri_count);
+        galois::WaitGroup wg;
+        PANDO_CHECK(wg.initialize(0));
+        auto wgh = wg.getHandle();
+        auto inner_state = galois::make_tpl(graph_ptr, final_tri_count, wgh);
         galois::doAll(
             inner_state, fmap(lcsr, vertices, host_vertex_iter_offset, query_sz),
             +[](decltype(inner_state) inner_state, typename GraphDL::VertexTopologyID v0) {
-              auto [graph_ptr, final_tri_count] = inner_state;
+              auto [graph_ptr, final_tri_count, wgh] = inner_state;
               GraphDL graph = *graph_ptr;

               // Degree Filtering Optimization
               uint64_t v0_degree = graph.getNumEdges(v0);
               if (v0_degree < (TC_EMBEDDING_SZ - 1))
                 return;

-              edge_tc_counting<GraphDL>(graph_ptr, v0, graph.edges(v0), final_tri_count);
+              edge_tc_counting<GraphDL>(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count);
             });
+        PANDO_CHECK(wg.wait());

         // Move iter offset
         uint64_t lcsr_num_vertices = fmap(lcsr, size);
         host_vertex_iter_offset += query_sz;
         if (host_vertex_iter_offset < lcsr_num_vertices)
           work_remaining.increment();
         host_vertex_iter_offset_ref = host_vertex_iter_offset;
+        wg.deinitialize();
       });

   uint64_t current_count = final_tri_count.reduce();
pando-rt/src/init.cpp (84 changes: 50 additions & 34 deletions)
@@ -198,29 +198,35 @@ int main(int argc, char* argv[]) {
   rc = getrusage(RUSAGE_SELF, &end);
   if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");}
   auto thisPlace = pando::getCurrentPlace();
-  SPDLOG_WARN("Total time on node: {}, was {}ns",
-              thisPlace.node.id,
-              end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
-              (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
-              end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
-              (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
-  for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
-    SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                idleCount.get(i));
-    SPDLOG_WARN("Pointer time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                pointerCount.get(i));
-    SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                schedulerCount.get(i));
-    SPDLOG_WARN("DoAll time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                doAllCount.get(i));
-  }
+  for(std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) {
+    if (j == thisPlace.node.id) {
+      SPDLOG_WARN("Total time on node: {}, was {}ns",
+                  thisPlace.node.id,
+                  end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
+                  (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
+                  end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
+                  (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
+      for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
+        SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    idleCount.get(i));
+        SPDLOG_WARN("Pointer time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    pointerCount.get(i));
+        SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    schedulerCount.get(i));
+        SPDLOG_WARN("DoAll time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    doAllCount.get(i));
+      }
+    }
+
+    pando::Nodes::barrier();
+  }


@@ -258,17 +264,27 @@ extern "C" __attribute__((visibility("default"))) int __drv_api_main(int argc, c
   if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");}
   auto thisPlace = pando::getCurrentPlace();
   auto dims = pando::getPlaceDims();
-  SPDLOG_WARN("Total time on node: {}, was {}ns",
-              thisPlace.node.id,
-              end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
-              (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
-              end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
-              (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
-  for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 2); i++) {
-    SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x + 1)) ? -1 : i),
-                idleCount.get(i));
-  }
+
+
+  if (pando::isOnCP()) {
+    for (std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) {
+      if (j == thisPlace.node.id) {
+        SPDLOG_WARN("Total time on node: {}, was {}ns",
+                    thisPlace.node.id,
+                    end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
+                    (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
+                    end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
+                    (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
+        for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
+          SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
+                      thisPlace.node.id,
+                      std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                      idleCount.get(i));
+        }
+      }

+      pando::CommandProcessor::barrier();
+    }
+  }

   return ret;
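Both init.cpp hunks serialize the per-node timer reports the same way: every node walks the same loop over node ids, only the node whose turn it is prints, and a barrier ends each turn so the reports from different nodes no longer interleave. Below is a minimal standalone sketch of that turn-taking pattern, with std::thread and C++20 std::barrier standing in for pando nodes and pando::Nodes::barrier()/pando::CommandProcessor::barrier(); the names and counts are invented.

// Sketch only: serializes per-"node" log output, one barrier per turn.
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int kNodes = 4;      // stand-in for dims.node.id
  std::barrier sync(kNodes); // stand-in for pando's node barrier

  std::vector<std::thread> nodes;
  for (int id = 0; id < kNodes; ++id) {
    nodes.emplace_back([&sync, id] {
      for (int j = 0; j < kNodes; ++j) {
        if (j == id) // my turn: print this node's timers uninterrupted
          std::printf("Total time on node: %d, was ...ns\n", id);
        sync.arrive_and_wait(); // everyone syncs before the next node prints
      }
    });
  }
  for (auto& t : nodes) t.join();
  return 0;
}

Every node must reach the barrier the same number of times, which is why the barrier sits outside the `if (j == thisPlace.node.id)` check in the real change as well.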
