From 30140baa3e4f0a9a4ea8b650573db12ec021d259 Mon Sep 17 00:00:00 2001
From: Brenden Elgarten <32827342+elgarten@users.noreply.github.com>
Date: Mon, 12 Aug 2024 11:50:18 -0500
Subject: [PATCH] Fix hang in tc microbenchmark + fix overlapping prep timers
 (supersede #127 + #129) (#137)

* Fix hang in TC microbenchmark

Added a shared WaitGroup between the vertex doAll and the nested,
per-vertex edge doAll.

The original code would hang because of the separate wait groups: after
enqueueing a doAll in edge_tc_counting, a hart waits for it to complete
(tc_algos.cpp:42). However, this happens on every hart because of the
outer doAll in tc_no_chunk, so every hart is waiting and none is
available to complete the work being waited on.

With one combined wait group, the outer doAll tasks can finish after
enqueueing the inner doAll tasks without waiting for them to complete.
Harts are therefore freed to run the inner doAll tasks and make forward
progress.

* Fix overlapping prep timers

* Add synchronization to drv timers
---
 .../triangle-counting/include/utils.hpp       |  3 +-
 microbench/triangle-counting/src/tc_algos.cpp | 38 +++++----
 pando-rt/src/init.cpp                         | 84 +++++++++++--------
 3 files changed, 74 insertions(+), 51 deletions(-)

diff --git a/microbench/triangle-counting/include/utils.hpp b/microbench/triangle-counting/include/utils.hpp
index 71aaac71..d2410d0c 100644
--- a/microbench/triangle-counting/include/utils.hpp
+++ b/microbench/triangle-counting/include/utils.hpp
@@ -63,7 +63,7 @@ void printUsage(char* argv0);
 // CONNECTION KERNELS
 // #####################################################################
 template <typename GraphType>
-void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr<GraphType> graph_ptr,
+void intersect_dag_merge(pando::GlobalPtr<GraphType> graph_ptr,
                          typename GraphType::VertexTopologyID v0,
                          typename GraphType::VertexTopologyID v1,
                          galois::DAccumulator<uint64_t> final_tri_count) {
@@ -90,7 +90,6 @@ void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr
diff --git a/microbench/triangle-counting/src/tc_algos.cpp b/microbench/triangle-counting/src/tc_algos.cpp
index 6c1d813c..b9fcbeaf 100644
--- a/microbench/triangle-counting/src/tc_algos.cpp
+++ b/microbench/triangle-counting/src/tc_algos.cpp
@@ -14,12 +14,9 @@
  * @param[in] final_tri_count Thread-safe counter
  */
 template <typename Graph>
-void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexTopologyID v0,
-                      typename Graph::EdgeRange edge_range,
+void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr<Graph> graph_ptr,
+                      typename Graph::VertexTopologyID v0, typename Graph::EdgeRange edge_range,
                       galois::DAccumulator<uint64_t> final_tri_count) {
-  galois::WaitGroup wg;
-  PANDO_CHECK(wg.initialize(0));
-  auto wgh = wg.getHandle();
   auto innerState = galois::make_tpl(graph_ptr, v0, wgh, final_tri_count);
   Graph graph = *graph_ptr;
   galois::doAll(
@@ -28,8 +25,7 @@ void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexT
       auto [graph_ptr, v0, wgh, final_tri_count] = innerState;
       Graph g = *graph_ptr;
       typename Graph::VertexTopologyID v1 = fmap(g, getEdgeDst, eh);
-      wgh.addOne();
-      intersect_dag_merge(wgh, graph_ptr, v0, v1, final_tri_count);
+      intersect_dag_merge(graph_ptr, v0, v1, final_tri_count);
     },
     [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place {
       auto v0 = std::get<1>(innerState);
@@ -39,7 +35,6 @@ void edge_tc_counting(pando::GlobalPtr<Graph> graph_ptr, typename Graph::VertexT
                   : fmap(graph, getLocalityVertex, v1);
       return locality;
     });
-  PANDO_CHECK(wg.wait());
 }
 
 // #####################################################################
@@ -55,10 +50,16 @@ template <typename GraphType>
 void tc_no_chunk(pando::GlobalPtr<GraphType> graph_ptr,
                  galois::DAccumulator<uint64_t> final_tri_count) {
   GraphType graph = *graph_ptr;
-  auto state = galois::make_tpl(graph_ptr, final_tri_count);
+
+  galois::WaitGroup wg;
+  PANDO_CHECK(wg.initialize(0));
+  auto wgh = wg.getHandle();
+  auto state = galois::make_tpl(graph_ptr, final_tri_count, wgh);
+
   galois::doAll(
-      state, graph.vertices(), +[](decltype(state) state, typename GraphType::VertexTopologyID v0) {
-        auto [graph_ptr, final_tri_count] = state;
+      wgh, state, graph.vertices(),
+      +[](decltype(state) state, typename GraphType::VertexTopologyID v0) {
+        auto [graph_ptr, final_tri_count, wgh] = state;
         GraphType graph = *graph_ptr;
 
         // Degree Filtering Optimization
@@ -66,8 +67,10 @@ void tc_no_chunk(pando::GlobalPtr<GraphType> graph_ptr,
         if (v0_degree < (TC_EMBEDDING_SZ - 1))
           return;
 
-        edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count);
+        edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count);
       });
+  PANDO_CHECK(wg.wait());
+  wg.deinitialize();
 }
 
 /**
@@ -159,11 +162,14 @@ void tc_chunk_vertices(pando::GlobalPtr<GraphDL> graph_ptr,
         auto lcsr = graph.getLocalCSR();
         uint64_t host_vertex_iter_offset = host_vertex_iter_offset_ref;
 
-        auto inner_state = galois::make_tpl(graph_ptr, final_tri_count);
+        galois::WaitGroup wg;
+        PANDO_CHECK(wg.initialize(0));
+        auto wgh = wg.getHandle();
+        auto inner_state = galois::make_tpl(graph_ptr, final_tri_count, wgh);
         galois::doAll(
             inner_state, fmap(lcsr, vertices, host_vertex_iter_offset, query_sz),
             +[](decltype(inner_state) inner_state, typename GraphDL::VertexTopologyID v0) {
-              auto [graph_ptr, final_tri_count] = inner_state;
+              auto [graph_ptr, final_tri_count, wgh] = inner_state;
               GraphDL graph = *graph_ptr;
 
               // Degree Filtering Optimization
@@ -171,8 +177,9 @@ void tc_chunk_vertices(pando::GlobalPtr<GraphDL> graph_ptr,
              if (v0_degree < (TC_EMBEDDING_SZ - 1))
                return;
 
-              edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count);
+              edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count);
             });
+        PANDO_CHECK(wg.wait());
 
         // Move iter offset
         uint64_t lcsr_num_vertices = fmap(lcsr, size);
@@ -180,6 +187,7 @@
         if (host_vertex_iter_offset < lcsr_num_vertices)
           work_remaining.increment();
         host_vertex_iter_offset_ref = host_vertex_iter_offset;
+        wg.deinitialize();
       });
 
   uint64_t current_count = final_tri_count.reduce();
diff --git a/pando-rt/src/init.cpp b/pando-rt/src/init.cpp
index f151fe28..ecfd1001 100644
--- a/pando-rt/src/init.cpp
+++ b/pando-rt/src/init.cpp
@@ -198,29 +198,35 @@ int main(int argc, char* argv[]) {
   rc = getrusage(RUSAGE_SELF, &end);
   if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");}
   auto thisPlace = pando::getCurrentPlace();
-  SPDLOG_WARN("Total time on node: {}, was {}ns",
-              thisPlace.node.id,
-              end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
-              (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
-              end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
-              (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
-  for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
-    SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                idleCount.get(i));
-    SPDLOG_WARN("Pointer time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                pointerCount.get(i));
-    SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                schedulerCount.get(i));
-    SPDLOG_WARN("DoAll time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
-                doAllCount.get(i));
+  for(std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) {
+    if (j == thisPlace.node.id) {
+      SPDLOG_WARN("Total time on node: {}, was {}ns",
+                  thisPlace.node.id,
+                  end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
+                  (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
+                  end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
+                  (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
+      for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
+        SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    idleCount.get(i));
+        SPDLOG_WARN("Pointer time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    pointerCount.get(i));
+        SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    schedulerCount.get(i));
+        SPDLOG_WARN("DoAll time on node: {}, core: {} was {}",
+                    thisPlace.node.id,
+                    std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                    doAllCount.get(i));
+      }
+    }
+
+    pando::Nodes::barrier();
   }
 
@@ -258,17 +264,27 @@ extern "C" __attribute__((visibility("default"))) int __drv_api_main(int argc, c
   if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");}
   auto thisPlace = pando::getCurrentPlace();
   auto dims = pando::getPlaceDims();
-  SPDLOG_WARN("Total time on node: {}, was {}ns",
-              thisPlace.node.id,
-              end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
-              (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
-              end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
-              (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
-  for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 2); i++) {
-    SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
-                thisPlace.node.id,
-                std::int8_t((i == std::uint64_t(dims.core.x + 1)) ? -1 : i),
-                idleCount.get(i));
+
+
+  if (pando::isOnCP()) {
+    for (std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) {
+      if (j == thisPlace.node.id) {
+        SPDLOG_WARN("Total time on node: {}, was {}ns",
+                    thisPlace.node.id,
+                    end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 -
+                    (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) +
+                    end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 -
+                    (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000));
+        for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) {
+          SPDLOG_WARN("Idle time on node: {}, core: {} was {}",
+                      thisPlace.node.id,
+                      std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i),
+                      idleCount.get(i));
+        }
+      }
+
+      pando::CommandProcessor::barrier();
+    }
   }
 
   return ret;
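The deadlock described in the first commit-message bullet is not specific to pando-rt: any fixed-size executor hangs when every worker blocks waiting on work that can only run on that same executor. The sketch below is a minimal, self-contained analogy using std::thread, not the pando-rt/galois API; WaitGroup, Pool, and the task counts are illustrative inventions. The fixed-size pool plays the role of the harts, outer "vertex" tasks register inner "edge" work with one shared wait group and return without blocking, and only the top level waits, mirroring the patched tc_no_chunk/edge_tc_counting. If each outer task instead built its own wait group and waited inside the pool, both workers could end up parked and the inner tasks would never run.

#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Minimal wait group: add() work before enqueueing it, done() when it finishes.
class WaitGroup {
 public:
  void add(int n) { std::lock_guard<std::mutex> lk(m_); count_ += n; }
  void done() {
    std::lock_guard<std::mutex> lk(m_);
    if (--count_ == 0) cv_.notify_all();
  }
  void wait() {
    std::unique_lock<std::mutex> lk(m_);
    cv_.wait(lk, [this] { return count_ == 0; });
  }

 private:
  std::mutex m_;
  std::condition_variable cv_;
  int count_ = 0;
};

// Fixed-size worker pool standing in for a fixed number of harts.
class Pool {
 public:
  explicit Pool(unsigned n) {
    for (unsigned i = 0; i < n; ++i)
      workers_.emplace_back([this] {
        for (;;) {
          std::function<void()> task;
          {
            std::unique_lock<std::mutex> lk(m_);
            cv_.wait(lk, [this] { return stop_ || !q_.empty(); });
            if (stop_ && q_.empty()) return;
            task = std::move(q_.front());
            q_.pop();
          }
          task();  // run outside the lock so a task can submit more work
        }
      });
  }
  void submit(std::function<void()> f) {
    { std::lock_guard<std::mutex> lk(m_); q_.push(std::move(f)); }
    cv_.notify_one();
  }
  ~Pool() {
    { std::lock_guard<std::mutex> lk(m_); stop_ = true; }
    cv_.notify_all();
    for (auto& w : workers_) w.join();
  }

 private:
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> q_;
  std::vector<std::thread> workers_;
  bool stop_ = false;
};

constexpr int kVertices = 4;        // outer tasks
constexpr int kEdgesPerVertex = 3;  // inner tasks per outer task

int main() {
  WaitGroup wg;   // ONE wait group shared by the outer and inner loops
  Pool pool(2);   // deliberately fewer workers than outer tasks

  wg.add(kVertices);
  for (int v = 0; v < kVertices; ++v) {
    pool.submit([&wg, &pool, v] {
      // Outer task: register the inner work with the SAME wait group and
      // return without blocking.  If it instead created its own wait group
      // and called wait() here, every worker could be parked in wait() with
      // nobody left to run the inner tasks -- the hang the commit describes.
      wg.add(kEdgesPerVertex);
      for (int e = 0; e < kEdgesPerVertex; ++e)
        pool.submit([&wg, v, e] {
          std::printf("vertex %d, edge %d\n", v, e);  // inner "edge" task
          wg.done();
        });
      wg.done();
    });
  }
  wg.wait();  // single top-level wait, as in the patched tc_no_chunk
  return 0;
}

The general rule the fix follows: on a fixed pool of harts, a task should never block on completion of work that can only be scheduled onto the pool it is itself occupying; instead, completion is tracked in one shared counter that only the top level waits on.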