diff --git a/src/alg/totem_alg.h b/src/alg/totem_alg.h index f013822..49ccf4e 100644 --- a/src/alg/totem_alg.h +++ b/src/alg/totem_alg.h @@ -108,6 +108,7 @@ error_t bfs_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_bu_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_vwarp_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_hybrid(vid_t src_id, cost_t* cost); +error_t bfs_stepwise_hybrid(vid_t src_id, cost_t* cost); /** * Given an undirected, unweighted graph and a source vertex, compute the diff --git a/src/alg/totem_betweenness_hybrid.cu b/src/alg/totem_betweenness_hybrid.cu index 1147b7f..991a0bd 100644 --- a/src/alg/totem_betweenness_hybrid.cu +++ b/src/alg/totem_betweenness_hybrid.cu @@ -7,62 +7,58 @@ * Robert Woff */ -// Totem includes +// totem includes #include "totem_alg.h" #include "totem_centrality.h" #include "totem_engine.cuh" #include -/** - * Per-partition specific state - */ + +// Per-partition specific state. typedef struct betweenness_state_s { - cost_t* distance[MAX_PARTITION_COUNT]; // a list of distances state, one per - // partition - uint32_t* numSPs[MAX_PARTITION_COUNT]; // a list of number of shortest paths - // state, one per partition - uint32_t* numSPs_f[MAX_PARTITION_COUNT]; // a list of number of shortest paths - // state, one per partition - score_t* delta[MAX_PARTITION_COUNT]; // delta BC score for a vertex - bool* done; // pointer to global finish flag - cost_t level; // current level being processed by the partition - score_t* betweenness; // betweenness score + cost_t* distance[MAX_PARTITION_COUNT]; // a list of distances state, one + // per partition + uint32_t* numSPs[MAX_PARTITION_COUNT]; // a list of number of shortest + // paths state, one per partition + uint32_t* numSPs_f[MAX_PARTITION_COUNT]; // a list of number of shortest + // paths state, one per partition + score_t* delta[MAX_PARTITION_COUNT]; // delta BC score for a vertex + bool* done; // pointer to global finish flag + cost_t level; // current level being processed by the partition + score_t* betweenness; // betweenness score frontier_state_t frontier; - bool* comm; // flags that indicates whether to instruct the engine + bool* comm; // flags that indicates whether to instruct the engine // to perform communication or not. This array is // populated during the forward phase, and used during // the backward propagation phase } betweenness_state_t; -/** - * State shared between all partitions - */ +// State shared between all partitions. typedef struct betweenness_global_state_s { - score_t* betweenness_score; // final output buffer - score_t* betweenness_score_h; // used as a temporary buffer - vid_t src; // source vertex id (id after partitioning) - double epsilon; // determines accuracy of BC computation - int num_samples; // number of samples for approximate BC + score_t* betweenness_score; // final output buffer + score_t* betweenness_score_h; // used as a temporary buffer + vid_t src; // source vertex id (id after partitioning) + double epsilon; // determines accuracy of BC computation + int num_samples; // number of samples for approximate BC } betweenness_global_state_t; PRIVATE betweenness_global_state_t bc_g; -/** - * The neighbors forward propagation processing function. This function sets - * the level of the neighbors' vertex to one level more than the parent vertex. - * The assumption is that the threads of a warp invoke this function to process - * the warp's batch of work. 
In each iteration of the for loop, each thread - * processes a neighbor. For example, thread 0 in the warp processes neighbors - * at indices 0, VWARP_WIDTH, (2 * VWARP_WIDTH) etc. in the edges array, while - * thread 1 in the warp processes neighbors 1, (1 + VWARP_WIDTH), - * (1 + 2 * VWARP_WIDTH) and so on. - */ + +// The neighbors forward propagation processing function. This function sets +// the level of the neighbors' vertex to one level more than the parent vertex. +// The assumption is that the threads of a warp invoke this function to process +// the warp's batch of work. In each iteration of the for loop, each thread +// processes a neighbor. For example, thread 0 in the warp processes neighbors +// at indices 0, VWARP_WIDTH, (2 * VWARP_WIDTH) etc. in the edges array, while +// thread 1 in the warp processes neighbors 1, (1 + VWARP_WIDTH), +// (1 + 2 * VWARP_WIDTH) and so on. template __device__ inline void -forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, - const vid_t nbr_count, uint32_t v_numSPs, +forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, + const vid_t nbr_count, uint32_t v_numSPs, betweenness_state_t* state, bool& done_d) { - // Iterate through the portion of work - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { + // Iterate through the portion of work. + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { vid_t nbr = GET_VERTEX_ID(nbrs[i]); int nbr_pid = GET_PARTITION_ID(nbrs[i]); cost_t* nbr_distance = state->distance[nbr_pid]; @@ -79,10 +75,10 @@ forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, template __global__ void -forward_kernel(partition_t par, betweenness_state_t state, +forward_kernel(partition_t par, betweenness_state_t state, const vid_t* __restrict frontier, vid_t count) { - if (THREAD_GLOBAL_INDEX >= - vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) return; + if (THREAD_GLOBAL_INDEX >= + vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) { return; } const eid_t* __restrict vertices = par.subgraph.vertices; const uint32_t* __restrict numSPs = state.numSPs[par.id]; @@ -90,50 +86,50 @@ forward_kernel(partition_t par, betweenness_state_t state, // This flag is used to report the finish state of a block of threads. This // is useful to avoid having many threads writing to the global finished // flag, which can hurt performance (since "finished" is actually allocated - // on the host, and each write will cause a transfer over the PCI-E bus) + // on the host, and each write will cause a transfer over the PCI-E bus). __shared__ bool finished_block; finished_block = true; __syncthreads(); - vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + + vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + vwarp_warp_start_vertex(VWARP_WIDTH, VWARP_BATCH); vid_t end_vertex = start_vertex + vwarp_warp_batch_size(count, VWARP_WIDTH, VWARP_BATCH); int warp_offset = vwarp_thread_index(VWARP_WIDTH); - - // Iterate over my work - for(vid_t i = start_vertex; i < end_vertex; i++) { + + // Iterate over my work. + for (vid_t i = start_vertex; i < end_vertex; i++) { vid_t v = frontier[i]; // If the distance for this node is equal to the current level, then // forward process its neighbours to determine its contribution to - // the number of shortest paths + // the number of shortest paths. 
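    // (Brandes' forward rule, which forward_process_neighbors applies per
    // warp below: a neighbour w of v whose distance is still INF_COST is
    // pulled into level + 1, and while distance[w] == level + 1 its shortest
    // path count is accumulated as numSPs[w] += numSPs[v]. The CPU path does
    // this accumulation with __sync_fetch_and_add; the GPU path is assumed
    // to use an equivalent atomic add, since several warps may hit the same
    // neighbour.)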
const eid_t nbr_count = vertices[v + 1] - vertices[v]; vid_t* nbrs = par.subgraph.edges + vertices[v]; if (v >= par.subgraph.vertex_ext) { - nbrs = par.subgraph.edges_ext + + nbrs = par.subgraph.edges_ext + (vertices[v] - par.subgraph.edge_count_ext); } forward_process_neighbors - (warp_offset, nbrs, nbr_count, numSPs[v], &state, finished_block); + (warp_offset, nbrs, nbr_count, numSPs[v], &state, finished_block); } __syncthreads(); - // If there is remaining work to do, set the done flag to false + // If there is remaining work to do, set the done flag to false. if (!finished_block && THREAD_BLOCK_INDEX == 0) *(state.done) = false; } -typedef void(*bc_gpu_func_t)(partition_t*, betweenness_state_t*, vid_t*, vid_t, +typedef void(*bc_gpu_func_t)(partition_t*, betweenness_state_t*, vid_t*, vid_t, cudaStream_t); template #ifdef FEATURE_SM35 -PRIVATE __host__ __device__ +PRIVATE __host__ __device__ #else PRIVATE __host__ #endif /* FEATURE_SM35 */ void forward_launch_gpu(partition_t* par, betweenness_state_t* state, vid_t* frontier, vid_t count, cudaStream_t stream) { - if (count == 0) return; - dim3 blocks; + if (count == 0) { return; } + dim3 blocks; const int threads = MAX_THREADS_PER_BLOCK; kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), blocks, threads); @@ -143,7 +139,7 @@ void forward_launch_gpu(partition_t* par, betweenness_state_t* state, #ifdef FEATURE_SM35 PRIVATE __global__ -void forward_launch_at_boundary_kernel(partition_t par, +void forward_launch_at_boundary_kernel(partition_t par, betweenness_state_t state) { if (THREAD_GLOBAL_INDEX > 0 || (*state.frontier.count == 0)) { return; @@ -181,12 +177,11 @@ PRIVATE const bc_gpu_func_t FORWARD_GPU_FUNC[] = { forward_launch_gpu }; -/** - * Entry point for forward propagation on the GPU - */ +// Entry point for forward propagation on the GPU. PRIVATE inline void betweenness_forward_gpu(partition_t* par) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - frontier_update_list_gpu(&state->frontier, state->level, + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + frontier_update_list_gpu(&state->frontier, state->level, state->distance[par->id], par->streams[1]); if (engine_partition_algorithm() == PAR_SORTED_DSC) { @@ -202,16 +197,16 @@ PRIVATE inline void betweenness_forward_gpu(partition_t* par) { } } - // clear out the outbox buffers + // Clear out the outbox buffers. for (int rmt_pid = 0; rmt_pid < context.pset->partition_count; rmt_pid++) { grooves_box_table_t* outbox = &par->outbox[rmt_pid]; - if (rmt_pid == par->id || !outbox->count) continue; - cudaMemsetAsync(outbox->push_values, 0, outbox->count * sizeof(uint32_t), + if (rmt_pid == par->id || !outbox->count) { continue; } + cudaMemsetAsync(outbox->push_values, 0, outbox->count * sizeof(uint32_t), par->streams[1]); } - // If the vertices are sorted by degree, call a kernel that takes - // advantage of that + // If the vertices are sorted by degree, call a kernel that takes + // advantage of that. #ifdef FEATURE_SM35 if (engine_sorted()) { frontier_update_boundaries_gpu(&state->frontier, &par->subgraph, @@ -224,34 +219,32 @@ PRIVATE inline void betweenness_forward_gpu(partition_t* par) { #endif /* FEATURE_SM35 */ // Call the corresponding cuda kernel to perform forward propagation - // given the current state of the algorithm + // given the current state of the algorithm. 
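  // (The frontier length was produced asynchronously on streams[1] by
  // frontier_update_list_gpu above, so it has to be copied back to the host
  // and the stream synchronized before it can be used to size the kernel
  // launch below.)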
vid_t count; - CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, - sizeof(vid_t), cudaMemcpyDefault, - par->streams[1])); + CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, sizeof(vid_t), + cudaMemcpyDefault, par->streams[1])); CALL_CU_SAFE(cudaStreamSynchronize(par->streams[1])); int par_alg = engine_partition_algorithm(); - FORWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, + FORWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, par->streams[1]); } -/** - * Entry point for forward propagation on the CPU - */ +// Entry point for forward propagation on the CPU void betweenness_forward_cpu(partition_t* par) { // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - graph_t* subgraph = &par->subgraph; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + graph_t* subgraph = &par->subgraph; cost_t* distance = state->distance[par->id]; uint32_t* numSPs = state->numSPs[par->id]; bool done = true; bool comm = false; - // In parallel, iterate over vertices which are at the current level - OMP(omp parallel for schedule(runtime) reduction(& : done) + // In parallel, iterate over vertices which are at the current level. + OMP(omp parallel for schedule(runtime) reduction(& : done) reduction(| : comm)) for (vid_t v = 0; v < subgraph->vertex_count; v++) { if (distance[v] == state->level) { - for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; + for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; e++) { vid_t nbr = GET_VERTEX_ID(subgraph->edges[e]); int nbr_pid = GET_PARTITION_ID(subgraph->edges[e]); @@ -263,7 +256,7 @@ void betweenness_forward_cpu(partition_t* par) { } if (nbr_distance[nbr] == state->level + 1) { uint32_t* nbr_numSPs = state->numSPs_f[nbr_pid]; - __sync_fetch_and_add(&nbr_numSPs[nbr], numSPs[v]); + __sync_fetch_and_add(&nbr_numSPs[nbr], numSPs[v]); } } } @@ -273,30 +266,30 @@ void betweenness_forward_cpu(partition_t* par) { state->comm[state->level] = false; } - // If there is remaining work to do, set the done flag to false - if (!done) { + // If there is remaining work to do, set the done flag to false. + if (!done) { *(state->done) = false; } } -/** - * Distributes work to either the CPU or GPU - */ +// Distributes work to either the CPU or GPU. PRIVATE void betweenness_forward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); for (int pid = 0; pid < engine_partition_count(); pid++) { if (pid != par->id) { - state->numSPs_f[pid] = (uint32_t*)par->outbox[pid].push_values; + state->numSPs_f[pid] = + reinterpret_cast(par->outbox[pid].push_values); } } // Check which kind of processor this partition corresponds to and - // call the appropriate function to perform forward propagation + // call the appropriate function to perform forward propagation. 
if (par->processor.type == PROCESSOR_CPU) { betweenness_forward_cpu(par); } else if (par->processor.type == PROCESSOR_GPU) { @@ -304,14 +297,12 @@ PRIVATE void betweenness_forward(partition_t* par) { } else { assert(false); } - // Increment the level for the next round of forward propagation + // Increment the level for the next round of forward propagation. state->level++; } -/** - * The neighbors backward propagation processing function. This function - * computes the delta of a vertex. - */ +// The neighbors backward propagation processing function. This function +// computes the delta of a vertex. template __device__ void backward_process_neighbors(partition_t* par, betweenness_state_t* state, @@ -319,13 +310,13 @@ backward_process_neighbors(partition_t* par, betweenness_state_t* state, uint32_t v_numSPs, score_t* vwarp_delta_s, vid_t v) { int warp_offset = vwarp_thread_index(VWARP_WIDTH); score_t sum = 0; - // Iterate through the portion of work - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { + // Iterate through the portion of work. + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { vid_t nbr = GET_VERTEX_ID(nbrs[i]); int nbr_pid = GET_PARTITION_ID(nbrs[i]); cost_t* nbr_distance = state->distance[nbr_pid]; if (nbr_distance[nbr] == state->level + 1) { - // Compute an intermediary delta value in shared memory + // Compute an intermediary delta value in shared memory. score_t* nbr_delta = state->delta[nbr_pid]; uint32_t* nbr_numSPs = state->numSPs[nbr_pid]; sum += ((((score_t)v_numSPs) / ((score_t)nbr_numSPs[nbr])) * @@ -347,40 +338,38 @@ backward_process_neighbors(partition_t* par, betweenness_state_t* state, } } -/** - * CUDA kernel which performs backward propagation - */ +// CUDA kernel which performs backward propagation. template __global__ void betweenness_backward_kernel(partition_t par, betweenness_state_t state, const vid_t* __restrict frontier, vid_t count) { - if (THREAD_GLOBAL_INDEX >= - vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) return; + if (THREAD_GLOBAL_INDEX >= + vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) { return; } const eid_t* __restrict vertices = par.subgraph.vertices; const uint32_t* __restrict numSPs = state.numSPs[par.id]; // Each thread in every warp has an entry in the following array which will be - // used to calculate intermediary delta values in shared memory + // used to calculate intermediary delta values in shared memory. __shared__ score_t delta_s[MAX_THREADS_PER_BLOCK]; const int index = THREAD_BLOCK_INDEX / VWARP_WIDTH; score_t* vwarp_delta_s = &delta_s[index * VWARP_WIDTH]; - vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + + vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + vwarp_warp_start_vertex(VWARP_WIDTH, VWARP_BATCH); vid_t end_vertex = start_vertex + vwarp_warp_batch_size(count, VWARP_WIDTH, VWARP_BATCH); int warp_offset = vwarp_thread_index(VWARP_WIDTH); - - // Iterate over my work - for(vid_t i = start_vertex; i < end_vertex; i++) { + + // Iterate over my work. + for (vid_t i = start_vertex; i < end_vertex; i++) { vid_t v = frontier[i]; // If the vertex is at the current level, determine its contribution - // to the source vertex's delta value + // to the source vertex's delta value. 
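    // (Brandes' dependency accumulation: for every successor w of v, i.e.
    // every neighbour with distance[w] == distance[v] + 1,
    //     delta[v] += (numSPs[v] / numSPs[w]) * (1 + delta[w]),
    // which is the per-warp sum computed by backward_process_neighbors
    // below; the accumulated delta is what finally feeds the betweenness
    // score of v.)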
const eid_t nbr_count = vertices[v + 1] - vertices[v]; vid_t* nbrs = par.subgraph.edges + vertices[v]; if (v >= par.subgraph.vertex_ext) { - nbrs = par.subgraph.edges_ext + + nbrs = par.subgraph.edges_ext + (vertices[v] - par.subgraph.edge_count_ext); } backward_process_neighbors @@ -390,15 +379,15 @@ betweenness_backward_kernel(partition_t par, betweenness_state_t state, template #ifdef FEATURE_SM35 -PRIVATE __host__ __device__ +PRIVATE __host__ __device__ #else PRIVATE __host__ -#endif /* FEATURE_SM35 */ +#endif /* FEATURE_SM35 */ void backward_launch_gpu(partition_t* par, betweenness_state_t* state, vid_t* frontier, vid_t count, cudaStream_t stream) { - if (count == 0) return; + if (count == 0) { return; } dim3 blocks; const int threads = MAX_THREADS_PER_BLOCK; - kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), + kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), blocks, threads); betweenness_backward_kernel <<>>(*par, *state, frontier, count); @@ -406,7 +395,7 @@ void backward_launch_gpu(partition_t* par, betweenness_state_t* state, #ifdef FEATURE_SM35 PRIVATE __global__ -void backward_launch_at_boundary_kernel(partition_t par, +void backward_launch_at_boundary_kernel(partition_t par, betweenness_state_t state) { if (THREAD_GLOBAL_INDEX > 0 || (*state.frontier.count == 0)) { return; @@ -433,7 +422,7 @@ void backward_launch_at_boundary_kernel(partition_t par, } } } -#endif /* FEATURE_SM35 */ +#endif /* FEATURE_SM35 */ PRIVATE const bc_gpu_func_t BACKWARD_GPU_FUNC[] = { // RANDOM algorithm @@ -444,14 +433,13 @@ PRIVATE const bc_gpu_func_t BACKWARD_GPU_FUNC[] = { backward_launch_gpu }; -/** - * Entry point for backward propagation on GPU - */ +// Entry point for backward propagation on GPU. PRIVATE inline void betweenness_backward_gpu(partition_t* par) { // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); - // If the vertices are sorted by degree, call a kernel that takes + // If the vertices are sorted by degree, call a kernel that takes // advantage of that #ifdef FEATURE_SM35 if (engine_sorted()) { @@ -465,32 +453,31 @@ PRIVATE inline void betweenness_backward_gpu(partition_t* par) { #endif /* FEATURE_SM35 */ vid_t count; - CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, - sizeof(vid_t), cudaMemcpyDefault, + CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, + sizeof(vid_t), cudaMemcpyDefault, par->streams[1])); CALL_CU_SAFE(cudaStreamSynchronize(par->streams[1])); int par_alg = engine_partition_algorithm(); - BACKWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, + BACKWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, par->streams[1]); } -/** - * Entry point for backward propagation on CPU - */ +// Entry point for backward propagation on CPU. void betweenness_backward_cpu(partition_t* par) { - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; cost_t* distance = state->distance[par->id]; uint32_t* numSPs = state->numSPs[par->id]; score_t* delta = state->delta[par->id]; - // In parallel, iterate over vertices which are at the current level + // In parallel, iterate over vertices which are at the current level. 
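  // (No atomics are needed in this loop: each iteration writes only delta[v]
  // and betweenness[v] of its own vertex, and the neighbour deltas it reads
  // belong to level + 1, which are no longer being written at this point of
  // the backward sweep.)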
OMP(omp parallel for schedule(runtime)) for (vid_t v = 0; v < subgraph->vertex_count; v++) { cost_t v_distance = distance[v]; if (v_distance == state->level) { - // For all neighbors of v, iterate over paths + // For all neighbors of v, iterate over paths. score_t delta_v = 0; for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; e++) { @@ -498,7 +485,8 @@ void betweenness_backward_cpu(partition_t* par) { int nbr_pid = GET_PARTITION_ID(subgraph->edges[e]); cost_t* nbr_distance = state->distance[nbr_pid]; - // Check whether the neighbour is local or remote and update accordingly + // Check whether the neighbour is local or remote and update + // accordingly. if (nbr_distance[nbr] == state->level + 1) { score_t* nbr_delta = state->delta[nbr_pid]; uint32_t* nbr_numSPs = state->numSPs[nbr_pid]; @@ -506,32 +494,32 @@ void betweenness_backward_cpu(partition_t* par) { (nbr_delta[nbr] + 1)); } } - // Add the dependency to the BC sum + // Add the dependency to the BC sum. delta[v] += delta_v; state->betweenness[v] += delta[v]; } } } -/** - * Distributes work for backward propagation to either the CPU or GPU - */ +// Distributes work for backward propagation to either the CPU or GPU. PRIVATE void betweenness_backward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count || state->level == 0) { return; } for (int pid = 0; pid < engine_partition_count(); pid++) { if (pid != par->id) { - state->delta[pid] = (score_t*)par->outbox[pid].pull_values; + state->delta[pid] = + reinterpret_cast(par->outbox[pid].pull_values); } } if (engine_superstep() > 1) { // Check what kind of processing unit corresponds to this partition and - // then call the appropriate function to perform backward propagation + // then call the appropriate function to perform backward propagation. if (par->processor.type == PROCESSOR_CPU) { betweenness_backward_cpu(par); } else if (par->processor.type == PROCESSOR_GPU) { @@ -540,10 +528,11 @@ PRIVATE void betweenness_backward(partition_t* par) { assert(false); } } - // Decrement the level for the next round of backward propagation + + // Decrement the level for the next round of backward propagation. state->level--; - // Check whether backward propagation is finished + // Check whether backward propagation is finished. if (state->level > 0) { engine_report_not_finished(); if (!state->comm[state->level]) { @@ -552,26 +541,24 @@ PRIVATE void betweenness_backward(partition_t* par) { } } -/* - * Parallel CPU implementation of betweenness scatter function - */ +// Parallel CPU implementation of betweenness scatter function. PRIVATE inline void betweenness_scatter_cpu(int pid, grooves_box_table_t* inbox, betweenness_state_t* state) { cost_t* distance = state->distance[pid]; uint32_t* numSPs = state->numSPs[pid]; - // Get the values that have been pushed to this vertex - uint32_t* inbox_values = (uint32_t*)inbox->push_values; + // Get the values that have been pushed to this vertex. 
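  // (Push data flow: during the forward phase each partition adds its
  // contributions for remote vertices into the corresponding outbox
  // (numSPs_f is aliased onto the outbox push_values buffer), the engine
  // then delivers each outbox into the matching inbox between supersteps,
  // and this scatter folds the delivered counts into the local
  // distance/numSPs state.)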
+ uint32_t* inbox_values = reinterpret_cast(inbox->push_values); OMP(omp parallel for schedule(runtime)) for (vid_t index = 0; index < inbox->count; index++) { if (inbox_values[index] != 0) { vid_t vid = inbox->rmt_nbrs[index]; // If the distance was previously infinity, initialize it to the - // current level + // current level. if (distance[vid] == INF_COST) { distance[vid] = state->level; } - // If the distance is equal to the current level, update the - // nodes number of shortest paths with the pushed value + // If the distance is equal to the current level, update the nodes + // number of shortest paths with the pushed value. if (distance[vid] == state->level) { numSPs[vid] += inbox_values[index]; } @@ -579,65 +566,60 @@ PRIVATE inline void betweenness_scatter_cpu(int pid, grooves_box_table_t* inbox, } } -/* - * Kernel for betweenness_scatter_gpu - */ -__global__ void betweenness_scatter_kernel(grooves_box_table_t inbox, +__global__ void betweenness_scatter_kernel(grooves_box_table_t inbox, cost_t* distance, uint32_t* numSPs, cost_t level) { vid_t index = THREAD_GLOBAL_INDEX; - if (index >= inbox.count) return; + if (index >= inbox.count) { return; } - // Get the values that have been pushed to this vertex - uint32_t* inbox_values = (uint32_t*)inbox.push_values; + // Get the values that have been pushed to this vertex. + uint32_t* inbox_values = reinterpret_cast(inbox.push_values); if (inbox_values[index] != 0) { vid_t vid = inbox.rmt_nbrs[index]; // If the distance was previously infinity, initialize it to the - // current level + // current level. if (distance[vid] == INF_COST) { distance[vid] = level; } - // If the distance is equal to the current level, update the - // nodes number of shortest paths with the pushed value + // If the distance is equal to the current level, update the + // nodes number of shortest paths with the pushed value. if (distance[vid] == level) { numSPs[vid] += inbox_values[index]; } } } -/* - * Parallel GPU implementation of betweenness scatter function - */ -PRIVATE inline void betweenness_scatter_gpu(partition_t* par, +// Parallel GPU implementation of betweenness scatter function. +PRIVATE inline void betweenness_scatter_gpu(partition_t* par, grooves_box_table_t* inbox, betweenness_state_t* state) { dim3 blocks, threads; KERNEL_CONFIGURE(inbox->count, blocks, threads); - // Invoke the appropriate CUDA kernel to perform the scatter functionality + // Invoke the appropriate CUDA kernel to perform the scatter functionality. betweenness_scatter_kernel<<streams[1]>>> (*inbox, state->distance[par->id], state->numSPs[par->id], state->level); CALL_CU_SAFE(cudaGetLastError()); } -/** - * Update the number of shortest paths from remote vertices - * Also update distance if it has yet to be initialized - */ + +// Update the number of shortest paths from remote vertices +// Also update distance if it has yet to be initialized. PRIVATE void betweenness_scatter_forward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. 
+ betweenness_state_t* state = + reinterpret_cast(par->algo_state); for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (rmt_pid == par->id) continue; - // For all remote partitions, get the corresponding inbox + if (rmt_pid == par->id) { continue; } + // For all remote partitions, get the corresponding inbox. grooves_box_table_t* inbox = &par->inbox[rmt_pid]; - if (!inbox->count) continue; + if (!inbox->count) { continue; } // If the inbox has some values, determine which type of processing unit - // corresponds to this partition and call the appropriate scatter function - if (!engine_get_comm_prev(rmt_pid)) continue; + // corresponds to this partition and call the appropriate scatter function. + if (!engine_get_comm_prev(rmt_pid)) { continue; } if (par->processor.type == PROCESSOR_CPU) { betweenness_scatter_cpu(par->id, inbox, state); } else if (par->processor.type == PROCESSOR_GPU) { @@ -648,10 +630,8 @@ PRIVATE void betweenness_scatter_forward(partition_t* par) { } } -/* - * Parallel CPU implementation of betweenness gather function - */ -PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, +// Parallel CPU implementation of betweenness gather function. +PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, betweenness_state_t* state, score_t* values) { cost_t* distance = state->distance[pid]; @@ -659,79 +639,73 @@ PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, OMP(omp parallel for schedule(runtime)) for (vid_t index = 0; index < inbox->count; index++) { vid_t vid = inbox->rmt_nbrs[index]; - // Check whether the vertex's distance is equal to level + 1 + // Check whether the vertex's distance is equal to level + 1. if (distance[vid] == (state->level + 1)) { // If it is, we'll pass the vertex's current delta value to neighbouring - // nodes to be used during their next backward propagation phase - values[index] = delta[vid]; + // nodes to be used during their next backward propagation phase. + values[index] = delta[vid]; } } } -/* - * Kernel for betweenness_gather_gpu - */ -__global__ -void betweenness_gather_kernel(const vid_t* __restrict rmt_nbrs, +__global__ +void betweenness_gather_kernel(const vid_t* __restrict rmt_nbrs, const vid_t rmt_nbrs_count, - const cost_t* __restrict distance, - const cost_t level, - const score_t* __restrict delta, + const cost_t* __restrict distance, + const cost_t level, + const score_t* __restrict delta, score_t* values) { vid_t index = THREAD_GLOBAL_INDEX; - if (index >= rmt_nbrs_count) return; + if (index >= rmt_nbrs_count) { return; } vid_t vid = rmt_nbrs[index]; - // Check whether the vertex's distance is equal to level + 1 + // Check whether the vertex's distance is equal to level + 1. if (distance[vid] == level + 1) { - // If it is, we'll pass the vertex's current delta value to neighbouring - // nodes to be used during their next backward propagation phase - values[index] = delta[vid]; + // If it is, we'll pass the vertex's current delta value to neighbouring + // nodes to be used during their next backward propagation phase. + values[index] = delta[vid]; } } -/* - * Parallel GPU implementation of betweenness gather function - */ -PRIVATE inline +// Parallel GPU implementation of betweenness gather function. 
+PRIVATE inline void betweenness_gather_gpu(partition_t* par, grooves_box_table_t* inbox, betweenness_state_t* state, score_t* values) { dim3 blocks, threads; - KERNEL_CONFIGURE(inbox->count, blocks, threads); - // Invoke the appropriate CUDA kernel to perform the gather functionality + KERNEL_CONFIGURE(inbox->count, blocks, threads); + // Invoke the appropriate CUDA kernel to perform the gather functionality. betweenness_gather_kernel<<streams[1]>>> - (inbox->rmt_nbrs, inbox->count, state->distance[par->id], + (inbox->rmt_nbrs, inbox->count, state->distance[par->id], state->level, state->delta[par->id], values); CALL_CU_SAFE(cudaGetLastError()); } -/** - * Pass the number of shortest paths and delta values to neighbouring - * vertices to be used in the backwards propagation phase - */ +// Pass the number of shortest paths and delta values to neighbouring +// vertices to be used in the backwards propagation phase. PRIVATE void betweenness_gather_backward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); if (par->processor.type == PROCESSOR_GPU) { if (!state->comm[state->level]) { cudaMemsetAsync(state->frontier.count, 0, sizeof(vid_t), par->streams[1]); } else { - frontier_update_list_gpu(&state->frontier, state->level, + frontier_update_list_gpu(&state->frontier, state->level, state->distance[par->id], par->streams[1]); } } - + for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (rmt_pid == par->id) continue; - grooves_box_table_t* inbox = &par->inbox[rmt_pid]; - // For all remote partitions, get the corresponding inbox - if (!inbox->count) continue; - score_t* values = (score_t*)inbox->pull_values; + if (rmt_pid == par->id) { continue; } + grooves_box_table_t* inbox = &par->inbox[rmt_pid]; + // For all remote partitions, get the corresponding inbox. + if (!inbox->count) { continue; } + score_t* values = reinterpret_cast(inbox->pull_values); - if (!engine_get_comm_curr(rmt_pid)) continue; + if (!engine_get_comm_curr(rmt_pid)) { continue; } // If the inbox has some values, determine which type of processing unit // corresponds to this partition and call the appropriate gather function if (par->processor.type == PROCESSOR_CPU) { @@ -740,59 +714,56 @@ PRIVATE void betweenness_gather_backward(partition_t* par) { betweenness_gather_gpu(par, inbox, state, values); } else { assert(false); - } + } } } -/** - * Initializes the state for a round of backward propagation - */ +// Initializes the state for a round of backward propagation. PRIVATE void betweenness_init_backward(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. 
+ totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; } - // Initialize the delta values to 0 - CALL_SAFE(totem_memset(state->delta[par->id], (score_t)0, vcount, type, + // Initialize the delta values to 0. + CALL_SAFE(totem_memset(state->delta[par->id], (score_t)0, vcount, type, par->streams[1])); - state->level--; } -/** - * Initializes the state for a round of forward propagation - */ +// Initializes the state for a round of forward propagation. PRIVATE void betweenness_init_forward(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); - // Get the source partition and source vertex values + // Get the source partition and source vertex values. id_t src_pid = GET_PARTITION_ID(bc_g.src); id_t src_vid = GET_VERTEX_ID(bc_g.src); vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; } - // Initialize the distances to infinity and numSPs to 0 + // Initialize the distances to infinity and numSPs to 0. for (int pid = 0; pid < engine_partition_count(); pid++) { vid_t count = (pid != par->id) ? par->outbox[pid].count : vcount; if (count) { - CALL_SAFE(totem_memset((state->distance[pid]), INF_COST, count, type, + CALL_SAFE(totem_memset((state->distance[pid]), INF_COST, count, type, par->streams[1])); - CALL_SAFE(totem_memset((state->numSPs[pid]), (uint32_t)0, count, type, + CALL_SAFE(totem_memset((state->numSPs[pid]), (uint32_t)0, count, type, par->streams[1])); } } @@ -803,9 +774,9 @@ PRIVATE void betweenness_init_forward(partition_t* par) { CALL_SAFE(totem_memset(&((state->numSPs[par->id])[src_vid]), (uint32_t)1, 1, type, par->streams[1])); } - + // Initialize the outbox to 0 and set the level to 0 - engine_set_outbox(par->id, 0); + engine_set_outbox(par->id, 0); state->level = 0; totem_memset(state->comm, true, engine_vertex_count(), TOTEM_MEM_HOST); } @@ -814,68 +785,69 @@ PRIVATE void betweenness_init_forward(partition_t* par) { * Allocates and initializes the state for Betweenness Centrality */ PRIVATE void betweenness_init(partition_t* par) { - if (!par->subgraph.vertex_count) return; + if (!par->subgraph.vertex_count) { return; } // Allocate memory for the per-partition state - betweenness_state_t* state = (betweenness_state_t*) - calloc(1, sizeof(betweenness_state_t)); - assert(state); - // Set the partition's state variable to the previously allocated state + betweenness_state_t* state = reinterpret_cast + (calloc(1, sizeof(betweenness_state_t))); + assert(state); + // Set the partition's state variable to the previously allocated state. par->algo_state = state; vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. 
+ totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; frontier_init_gpu(&state->frontier, par->subgraph.vertex_count); } - + CALL_SAFE(totem_calloc(vcount * sizeof(score_t), type, - (void**)&(state->delta[par->id]))); + reinterpret_cast(&(state->delta[par->id])))); CALL_SAFE(totem_calloc(vcount * sizeof(score_t), type, - (void**)&(state->betweenness))); + reinterpret_cast(&(state->betweenness)))); // Allocate memory for the various pieces of data required for the - // Betweenness Centrality algorithm + // Betweenness Centrality algorithm. for (int pid = 0; pid < engine_partition_count(); pid++) { vid_t count = (pid != par->id) ? par->outbox[pid].count : vcount; if (count) { - CALL_SAFE(totem_malloc(count * sizeof(cost_t), type, - (void**)&(state->distance[pid]))); - CALL_SAFE(totem_calloc(count * sizeof(uint32_t), type, - (void**)&(state->numSPs[pid]))); - } + CALL_SAFE(totem_malloc(count * sizeof(cost_t), type, + reinterpret_cast + (&(state->distance[pid])))); + CALL_SAFE(totem_calloc(count * sizeof(uint32_t), type, + reinterpret_cast(&(state->numSPs[pid])))); + } state->numSPs_f[pid] = state->numSPs[pid]; } - // Initialize the state's done flag + // Initialize the state's done flag. state->done = engine_get_finished_ptr(par->id); - // Initialize the comm array - totem_calloc(engine_vertex_count(), TOTEM_MEM_HOST, (void**)&state->comm); + // Initialize the comm array. + totem_calloc(engine_vertex_count(), TOTEM_MEM_HOST, + reinterpret_cast(&state->comm)); - // Initialize the state + // Initialize the state. betweenness_init_forward(par); } -/** - * Cleans up allocated memory on the CPU and GPU - */ +// Cleans up allocated memory on the CPU and GPU. PRIVATE void betweenness_finalize(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - - // Free the allocated memory - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { - type = TOTEM_MEM_DEVICE; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } + + // Free the allocated memory. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + + // Determine which type of memory this partition corresponds to. + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { + type = TOTEM_MEM_DEVICE; frontier_finalize_gpu(&state->frontier); } - // Free the memory allocated for the algorithm + // Free the memory allocated for the algorithm. for (int pid = 0; pid < engine_partition_count(); pid++) { totem_free(state->distance[pid], type); totem_free(state->numSPs[pid], type); @@ -884,34 +856,34 @@ PRIVATE void betweenness_finalize(partition_t* par) { totem_free(state->betweenness, type); totem_free(state->comm, TOTEM_MEM_HOST); - // Free the per-partition state and set it to NULL + // Free the per-partition state and set it to NULL. 
free(state); par->algo_state = NULL; } -/** - * Aggregates the final result to be returned at the end - */ -PRIVATE void betweenness_aggr(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; +// Aggregates the final result to be returned at the end +PRIVATE void betweenness_aggr(partition_t* par) { + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; score_t* betweenness_values = NULL; - // Determine which type of processor this partition corresponds to + // Determine which type of processor this partition corresponds to. if (par->processor.type == PROCESSOR_CPU) { - // If it is a CPU partition, grab the computed betweenness value directly + // If it is a CPU partition, grab the computed betweenness value directly. betweenness_values = state->betweenness; } else if (par->processor.type == PROCESSOR_GPU) { - // If it is a GPU partition, copy the computed score back to the host + // If it is a GPU partition, copy the computed score back to the host. assert(bc_g.betweenness_score_h); - CALL_CU_SAFE(cudaMemcpy(bc_g.betweenness_score_h, state->betweenness, + CALL_CU_SAFE(cudaMemcpy(bc_g.betweenness_score_h, state->betweenness, subgraph->vertex_count * sizeof(score_t), cudaMemcpyDefault)); betweenness_values = bc_g.betweenness_score_h; } else { assert(false); } + // Aggregate the results assert(bc_g.betweenness_score); OMP(omp parallel for schedule(static)) @@ -925,42 +897,44 @@ PRIVATE void betweenness_aggr(partition_t* par) { // computed using a subset of the total nodes within the graph // The scaling value is: (Total Number of Nodes / Subset of Nodes Used) bc_g.betweenness_score[par->map[v]] = betweenness_values[v] * - (score_t)(((double)(engine_vertex_count())) / bc_g.num_samples); + static_cast((static_cast(engine_vertex_count())) / + bc_g.num_samples); } } } -/** - * The following two functions are the kernel and gather callbacks of a single - * BSP cycle that synchronizes the distance of remote vertices - */ +// The following two functions are the kernel and gather callbacks of a single +// BSP cycle that synchronizes the distance of remote vertices. PRIVATE void betweenness_gather_distance(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } if (engine_superstep() == 1) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); engine_gather_inbox(par->id, state->distance[par->id]); } } PRIVATE void betweenness_synch_distance(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. 
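  // (This kernel and betweenness_gather_distance above form one pull-based
  // BSP round: in superstep 1 the gather callback hands the engine the local
  // distance array so that remote partitions can pull the values of boundary
  // vertices, and only "not finished" is reported; in superstep 2 the pulled
  // values arrive in outbox[rmt_pid].pull_values and are copied into the
  // local mirrors distance[rmt_pid]. The numSPs pair below repeats the same
  // pattern.)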
+ if (!par->subgraph.vertex_count) { return; } + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); if (engine_superstep() == 1) { engine_report_not_finished(); } else { for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (par->id == rmt_pid) continue; + if (par->id == rmt_pid) { continue; } if (par->processor.type == PROCESSOR_GPU) { - CALL_CU_SAFE(cudaMemcpyAsync(state->distance[rmt_pid], + CALL_CU_SAFE(cudaMemcpyAsync(state->distance[rmt_pid], par->outbox[rmt_pid].pull_values, - par->outbox[rmt_pid].count * + par->outbox[rmt_pid].count * sizeof(cost_t), cudaMemcpyDefault, par->streams[1])); } else { - cost_t* src = (cost_t*)par->outbox[rmt_pid].pull_values; + cost_t* src = + reinterpret_cast(par->outbox[rmt_pid].pull_values); cost_t* dst = state->distance[rmt_pid]; OMP(omp parallel for schedule(static)) for (int i = 0; i < par->outbox[rmt_pid].count; i++) { @@ -971,37 +945,38 @@ PRIVATE void betweenness_synch_distance(partition_t* par) { } } -/** - * The following two functions are the kernel and gather callbacks of a single - * BSP cycle that synchronizes the numSPs of remote vertices - */ +// The following two functions are the kernel and gather callbacks of a single +// BSP cycle that synchronizes the numSPs of remote vertices. PRIVATE void betweenness_gather_numSPs(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } if (engine_superstep() == 1) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); engine_gather_inbox(par->id, state->numSPs[par->id]); } } PRIVATE void betweenness_synch_numSPs(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); if (engine_superstep() == 1) { engine_report_not_finished(); } else { for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (par->id == rmt_pid) continue; + if (par->id == rmt_pid) { continue; } if (par->processor.type == PROCESSOR_GPU) { - CALL_CU_SAFE(cudaMemcpyAsync(state->numSPs[rmt_pid], + CALL_CU_SAFE(cudaMemcpyAsync(state->numSPs[rmt_pid], par->outbox[rmt_pid].pull_values, - par->outbox[rmt_pid].count * + par->outbox[rmt_pid].count * sizeof(uint32_t), cudaMemcpyDefault, par->streams[1])); } else { - uint32_t* src = (uint32_t*)par->outbox[rmt_pid].pull_values; + uint32_t* src = + reinterpret_cast(par->outbox[rmt_pid].pull_values); uint32_t* dst = state->numSPs[rmt_pid]; OMP(omp parallel for schedule(static)) for (int i = 0; i < par->outbox[rmt_pid].count; i++) { @@ -1012,33 +987,31 @@ PRIVATE void betweenness_synch_numSPs(partition_t* par) { } } -/** - * Core functionality for main for loop within the BC computation - */ +// Core functionality for main for loop within the BC computation. void betweenness_hybrid_core(vid_t source, bool is_first_iteration, bool is_last_iteration) { - // Set the source node for this iteration + // Set the source node for this iteration. bc_g.src = engine_vertex_id_in_partition(source); - // Forward propagation + // Forward propagation. 
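  // (One source iteration is built out of several BSP rounds driven by the
  // engine: a forward round with push communication that computes distance
  // and numSPs, two short pull rounds that replicate the distance and numSPs
  // of boundary vertices to the partitions that reference them, and a
  // backward round with pull communication that accumulates delta and adds
  // each vertex's dependency into its betweenness score.)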
engine_par_init_func_t init_forward = betweenness_init_forward; if (is_first_iteration) { init_forward = betweenness_init; } // Configure the parameters for forward propagation given the current - // iteration of the overall computation + // iteration of the overall computation. engine_config_t config_forward = { - NULL, betweenness_forward, betweenness_scatter_forward, NULL, + NULL, betweenness_forward, betweenness_scatter_forward, NULL, init_forward, NULL, NULL, GROOVES_PUSH }; - // Call Totem to begin the computation phase given the specified - // configuration + // Call Totem to begin the computation phase given the specified + // configuration. engine_config(&config_forward); engine_execute(); // Synchronize the distance and numSPs state, which have been calculated in // the forward phase, across all partitions. This state will be used in the - // backward propagation phase + // backward propagation phase. engine_config_t config_distance_state = { NULL, betweenness_synch_distance, NULL, betweenness_gather_distance, NULL, NULL, NULL, GROOVES_PULL @@ -1077,55 +1050,56 @@ void betweenness_hybrid_core(vid_t source, bool is_first_iteration, error_t betweenness_hybrid(double epsilon, score_t* betweenness_score) { // Sanity check on input bool finished = false; - error_t rc = betweenness_check_special_cases(engine_get_graph(), + error_t rc = betweenness_check_special_cases(engine_get_graph(), &finished, betweenness_score); - if (finished) return rc; + if (finished) { return rc; } // Initialize the global state memset(&bc_g, 0, sizeof(bc_g)); bc_g.betweenness_score = betweenness_score; - CALL_SAFE(totem_memset(bc_g.betweenness_score, (score_t)0, + CALL_SAFE(totem_memset(bc_g.betweenness_score, (score_t)0, engine_vertex_count(), TOTEM_MEM_HOST)); bc_g.epsilon = epsilon; if (engine_largest_gpu_partition()) { CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(score_t), - TOTEM_MEM_HOST_PINNED, - (void**)&bc_g.betweenness_score_h)); + TOTEM_MEM_HOST_PINNED, + reinterpret_cast + (&bc_g.betweenness_score_h))); } // Determine whether we will compute exact or approximate BC values if (epsilon == CENTRALITY_EXACT) { // Compute exact values for Betweenness Centrality vid_t vcount = engine_vertex_count(); - for (vid_t source = 0; source < vcount; source++) { - betweenness_hybrid_core(source, (source == 0), (source == (vcount-1))); + for (vid_t source = 0; source < vcount; source++) { + betweenness_hybrid_core(source, (source == 0), (source == (vcount-1))); } } else { // Compute approximate values based on the value of epsilon provided // Select a subset of source nodes to make the computation faster int num_samples = centrality_get_number_sample_nodes(engine_vertex_count(), epsilon); - // Store the number of samples used in the global state to be used for + // Store the number of samples used in the global state to be used for // scaling the computed metric during aggregation bc_g.num_samples = num_samples; // Populate the array of indices to sample vid_t* sample_nodes = centrality_select_sampling_nodes( engine_get_graph(), num_samples); - + for (int source_index = 0; source_index < num_samples; source_index++) { // Get the next sample node in the array to use as a source - vid_t source = sample_nodes[source_index]; - betweenness_hybrid_core(source, (source_index == 0), - (source_index == (num_samples-1))); - } - + vid_t source = sample_nodes[source_index]; + betweenness_hybrid_core(source, (source_index == 0), + (source_index == (num_samples-1))); + } + // Clean up the allocated memory 
free(sample_nodes); } - + // Clean up and return - if (engine_largest_gpu_partition()) { + if (engine_largest_gpu_partition()) { totem_free(bc_g.betweenness_score_h, TOTEM_MEM_HOST_PINNED); } memset(&bc_g, 0, sizeof(betweenness_global_state_t)); diff --git a/src/alg/totem_bfs_stepwise_hybrid.cu b/src/alg/totem_bfs_stepwise_hybrid.cu new file mode 100644 index 0000000..c5748cd --- /dev/null +++ b/src/alg/totem_bfs_stepwise_hybrid.cu @@ -0,0 +1,447 @@ +/** + * This file contains an implementation of the breadth-first search (BFS) graph + * search algorithm using the totem framework. This is a modified version that + * performs the algorithm in a Bottom Up fashion. + * + * This implementation only works for undirected graphs. + * + * Based off of the work by Scott Beamer et al. + * Searching for a Parent Instead of Fighting Over Children: A Fast + * Breadth-First Search Implementation for Graph500. + * http://www.eecs.berkeley.edu/Pubs/TechRpts/2011/EECS-2011-117.pdf + * + * TODO(scott): Modify the algorithm to swap between top down and bottom up + * steps. + * + * Created on: 2014-08-26 + * Authors: Scott Sallinen + * Abdullah Gharaibeh + */ + +#include "totem_alg.h" +#include "totem_engine.cuh" + +// Per-partition specific state. +typedef struct bfs_state_s { + cost_t* cost; // One slot per vertex in the partition. + bitmap_t visited[MAX_PARTITION_COUNT]; // A list of bitmaps, one for each + // remote partition. + bitmap_t frontier[MAX_PARTITION_COUNT]; // A list of bitmaps, one for each + // remote partition. + bool* finished; // Points to Totem's finish flag. + cost_t level; // Current level to process by the partition. + frontier_state_t frontier_state; // Frontier management state. +} bfs_state_t; + +// State shared between all partitions. +typedef struct bfs_global_state_s { + cost_t* cost; // Final output buffer. + cost_t* cost_h; // Used as a temporary buffer to receive the final + // result copied back from GPU partitions before being + // copied again to the final output buffer. + // TODO(abdullah): push this buffer to be managed by Totem + vid_t src; // Source vertex id. (The id after partitioning.) +} bfs_global_state_t; +PRIVATE bfs_global_state_t state_g = {NULL, NULL, 0}; + +// Checks for input parameters and special cases. This is invoked at the +// beginning of public interfaces (GPU and CPU) +PRIVATE error_t check_special_cases(vid_t src, cost_t* cost, bool* finished) { + *finished = true; + if ((src >= engine_vertex_count()) || (cost == NULL)) { + return FAILURE; + } else if (engine_vertex_count() == 1) { + cost[0] = 0; + return SUCCESS; + } else if (engine_edge_count() == 0) { + // Initialize cost to INFINITE. + totem_memset(cost, INF_COST, engine_vertex_count(), TOTEM_MEM_HOST); + cost[src] = 0; + return SUCCESS; + } + *finished = false; + return SUCCESS; +} + +// A step that iterates across unvisited vertices and determines +// their status in the next frontier. +PRIVATE void bfs_bu_step(partition_t* par, bfs_state_t* state) { + graph_t* subgraph = &par->subgraph; + bool finished = true; + + // Iterate across all of our vertices. + OMP(omp parallel for schedule(runtime) reduction(& : finished)) + for (vid_t vertex_id = 0; vertex_id < subgraph->vertex_count; vertex_id++) { + // Ignore the local vertex if it has already been visited. + if (bitmap_is_set(state->visited[par->id], vertex_id)) { continue; } + + // Iterate across the neighbours of this vertex. 
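    // (Bottom-up rationale, following Beamer et al.: instead of the frontier
    // expanding to its children, every still-unvisited vertex scans its own
    // neighbours looking for any parent that is already in the frontier; as
    // soon as one is found the vertex joins the next frontier and the scan
    // stops early via the break below. This saves edge traversals when the
    // frontier covers a large fraction of the graph.)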
+ for (eid_t i = subgraph->vertices[vertex_id]; + i < subgraph->vertices[vertex_id + 1]; i++) { + int nbr_pid = GET_PARTITION_ID(subgraph->edges[i]); + int nbr = GET_VERTEX_ID(subgraph->edges[i]); + + // Check if the bitmap corresponding to the vertices PID is set. + // This means the partition that the vertex belongs to, has explored it. + if (bitmap_is_set(state->frontier[nbr_pid], nbr)) { + // Add the vertex we are exploring to the next frontier. + bitmap_set_cpu(state->visited[par->id], vertex_id); + + // Increment the level of this vertex. + state->cost[vertex_id] = state->level + 1; + finished = false; + break; + } + } // End of neighbour check - vertex examined. + } // All vertices examined in level. + + // Move over the finished variable. + if (!finished) *(state->finished) = false; +} + +// This is a CPU version of the Bottom-up/Top-down BFS algorithm. +// See file header for full details. +void bfs_stepwise_cpu(partition_t* par, bfs_state_t* state) { + // Update the frontier. + frontier_update_bitmap_cpu(&state->frontier_state, state->visited[par->id]); + state->frontier[par->id] = state->frontier_state.current; + + // Execute a step. + bfs_bu_step(par, state); +} + +// A gpu version of the Bottom-up step as a kernel. +__global__ void bfs_bu_kernel(partition_t par, bfs_state_t state) { + const vid_t vertex_id = THREAD_GLOBAL_INDEX; + graph_t subgraph = par.subgraph; + if (vertex_id >= subgraph.vertex_count) { return; } + bool finished = true; + + // Ignore the local vertex if it has already been visited. + if (bitmap_is_set(state.visited[par.id], vertex_id)) { return; } + + // Iterate across all neighbours of the vertex. + for (eid_t i = subgraph.vertices[vertex_id]; + i < subgraph.vertices[vertex_id + 1]; i++) { + int nbr_pid = GET_PARTITION_ID(subgraph.edges[i]); + int nbr = GET_VERTEX_ID(subgraph.edges[i]); + + // Check if neighbour is in the current frontier. + if (bitmap_is_set(state.frontier[nbr_pid], nbr)) { + // Add the vertex we are exploring to the next frontier. + bitmap_set_gpu(state.visited[par.id], vertex_id); + + // Increment the level of this vertex. + state.cost[vertex_id] = state.level + 1; + finished = false; + break; + } + } + + // Move over the finished variable. + if (!finished) *(state.finished) = false; +} + +// This is a GPU version of the Bottom-up/Top-down BFS algorithm. +// See file header for full details. +__host__ error_t bfs_stepwise_gpu(partition_t* par, bfs_state_t* state) { + dim3 blocks; + dim3 threads_per_block; + KERNEL_CONFIGURE(par->subgraph.vertex_count, blocks, threads_per_block); + + // Update the frontier. + frontier_update_bitmap_gpu(&state->frontier_state, state->visited[par->id], + par->streams[1]); + state->frontier[par->id] = state->frontier_state.current; + + // Execute a step. + bfs_bu_kernel<<streams[1]>>> + (*par, *state); + + return SUCCESS; +} + +// The execution phase - based off of the partition we are, launch an approach. +PRIVATE void bfs(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + + // Ignore the first round - this allows us to communicate the frontier with + // an updated visited status of the source vertex. + if (engine_superstep() == 1) { + engine_report_not_finished(); + return; + } + + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Launch the processor specific algorithm. 
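  // (Both paths do the same two things: refresh the frontier bitmap from the
  // visited bitmap via frontier_update_bitmap_cpu/gpu, then run one
  // bottom-up step over the local vertices; the GPU variant launches
  // bfs_bu_kernel on streams[1].)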
+ if (par->processor.type == PROCESSOR_CPU) { + bfs_stepwise_cpu(par, state); + } else if (par->processor.type == PROCESSOR_GPU) { + bfs_stepwise_gpu(par, state); + } else { + assert(false); + } + + // At the end of the round, increase our BFS level. + state->level++; +} + +// Gather for the CPU bitmap to inbox. +PRIVATE void bfs_gather_cpu(partition_t* par, bfs_state_t* state, + grooves_box_table_t* inbox) { + // Iterate across the items in the inbox. + OMP(omp parallel for schedule(static)) + for (vid_t index = 0; index < inbox->count; index++) { + // Lookup the local vertex it points to. + vid_t vid = inbox->rmt_nbrs[index]; + + // Set the bit state to our local state. + if (bitmap_is_set(state->visited[par->id], vid)) { + bitmap_set_cpu(reinterpret_cast(inbox->pull_values), index); + } + } +} + +// Gather for the GPU bitmap to inbox. +__global__ void bfs_gather_gpu(partition_t par, bfs_state_t state, + grooves_box_table_t inbox) { + const vid_t index = THREAD_GLOBAL_INDEX; + if (index >= inbox.count) { return; } + + // Lookup the local vertex it points to. + vid_t vid = inbox.rmt_nbrs[index]; + + // Set the bit state to our local state. + if (bitmap_is_set(state.visited[par.id], vid)) { + bitmap_set_gpu(reinterpret_cast(inbox.pull_values), index); + } +} + +// The gather phase - apply values from the inboxes to the partitions' local +// variables. +PRIVATE void bfs_gather(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Across all partitions that are not us. + for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { + if (rmt_pid == par->id) { continue; } + + // Select the inbox to apply to. + grooves_box_table_t* inbox = &par->inbox[rmt_pid]; + if (inbox->count == 0) { continue; } + + // Select a method based off of our processor type. + if (par->processor.type == PROCESSOR_CPU) { + bfs_gather_cpu(par, state, inbox); + } else if (par->processor.type == PROCESSOR_GPU) { + // Set up to iterate across the items in the inbox. + dim3 blocks; + dim3 threads_per_block; + KERNEL_CONFIGURE(inbox->count, blocks, threads_per_block); + bfs_gather_gpu<<streams[1]>>> + (*par, *state, *inbox); + } else { + assert(false); + } + } +} + +// The aggregate phase - combine results to be presented. +PRIVATE void bfs_aggregate(partition_t* par) { + if (!par->subgraph.vertex_count) { return; } + + bfs_state_t* state = reinterpret_cast(par->algo_state); + graph_t* subgraph = &par->subgraph; + cost_t* src_cost = NULL; + + // Apply the cost from our partition into the final cost array. + if (par->processor.type == PROCESSOR_CPU) { + src_cost = state->cost; + } else if (par->processor.type == PROCESSOR_GPU) { + assert(state_g.cost_h); + CALL_CU_SAFE(cudaMemcpy(state_g.cost_h, state->cost, + subgraph->vertex_count * sizeof(cost_t), + cudaMemcpyDefault)); + src_cost = state_g.cost_h; + } else { + assert(false); + } + + // Aggregate the results. + assert(state_g.cost); + OMP(omp parallel for schedule(runtime)) + for (vid_t v = 0; v < subgraph->vertex_count; v++) { + state_g.cost[par->map[v]] = src_cost[v]; + } +} + +// A simple kernel that sets the source vertex to visited on the GPU. +__global__ void bfs_init_bu_kernel(bitmap_t visited, vid_t src) { + if (THREAD_GLOBAL_INDEX != 0) { return; } + bitmap_set_gpu(visited, src); +} + +// Initialize the GPU memory - bitmaps and frontier. +PRIVATE inline void bfs_init_gpu(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Initialize our visited bitmap. 
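  // Bitmap layout: visited[par->id] is this partition's own visited set,
  // while frontier[pid] for each remote partition aliases the corresponding
  // outbox pull buffer, which the engine refreshes every superstep with the
  // remote partition's visited status for boundary vertices, as gathered by
  // bfs_gather.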
+ state->visited[par->id] = bitmap_init_gpu(par->subgraph.vertex_count); + + // Initialize other partitions frontier bitmaps. + for (int pid = 0; pid < engine_partition_count(); pid++) { + // Assign the outboxes to our frontier bitmap pointers. + if (pid != par->id && par->outbox[pid].count != 0) { + state->frontier[pid] = + reinterpret_cast(par->outbox[pid].pull_values); + } + // Clear the inboxes, and also their shadows. + if (pid != par->id && par->inbox[pid].count != 0) { + bitmap_reset_gpu(reinterpret_cast + (par->inbox[pid].pull_values), + par->inbox[pid].count, + par->streams[1]); + bitmap_reset_gpu(reinterpret_cast + (par->inbox[pid].pull_values_s), + par->inbox[pid].count, + par->streams[1]); + } + } + + // Set the source vertex as visited, if it is in our partition. + if (GET_PARTITION_ID(state_g.src) == par->id) { + bfs_init_bu_kernel<<<1, 1, 0, par->streams[1]>>> + (state->visited[par->id], GET_VERTEX_ID(state_g.src)); + CALL_CU_SAFE(cudaGetLastError()); + } + + // Initialize our local frontier. + frontier_init_gpu(&state->frontier_state, par->subgraph.vertex_count); +} + +// Initialize the CPU memory - bitmaps and frontier. +PRIVATE inline void bfs_init_cpu(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Initialize our visited bitmap. + state->visited[par->id] = bitmap_init_cpu(par->subgraph.vertex_count); + + // Initialize other partitions bitmaps. + for (int pid = 0; pid < engine_partition_count(); pid++) { + // Assign the outboxes to our frontier bitmap pointers. + if (pid != par->id && par->outbox[pid].count != 0) { + state->frontier[pid] = + reinterpret_cast(par->outbox[pid].pull_values); + } + // Clear the inboxes, and also their shadows. + if (pid != par->id && par->inbox[pid].count != 0) { + bitmap_reset_cpu(reinterpret_cast + (par->inbox[pid].pull_values), + par->inbox[pid].count); + bitmap_reset_cpu(reinterpret_cast + (par->inbox[pid].pull_values_s), + par->inbox[pid].count); + } + } + + // Set the source vertex as visited, if it is in our partition. + if (GET_PARTITION_ID(state_g.src) == par->id) { + bitmap_set_cpu(state->visited[par->id], GET_VERTEX_ID(state_g.src)); + } + + // Initialize our local frontier. + frontier_init_cpu(&state->frontier_state, par->subgraph.vertex_count); +} + +// The init phase - Set up the memory and statuses. +PRIVATE void bfs_init(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + bfs_state_t* state = + reinterpret_cast(calloc(1, sizeof(bfs_state_t))); + assert(state); + + // Initialize based off of our processor type. + par->algo_state = state; + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_CPU) { + bfs_init_cpu(par); + } else if (par->processor.type == PROCESSOR_GPU) { + type = TOTEM_MEM_DEVICE; + bfs_init_gpu(par); + } else { + assert(false); + } + + // Allocate memory for the cost array, and set it to INFINITE cost. + CALL_SAFE(totem_malloc(par->subgraph.vertex_count * sizeof(cost_t), type, + reinterpret_cast(&(state->cost)))); + totem_memset(state->cost, INF_COST, par->subgraph.vertex_count, type, + par->streams[1]); + + if (GET_PARTITION_ID(state_g.src) == par->id) { + // For the source vertex, initialize cost. + totem_memset(&((state->cost)[GET_VERTEX_ID(state_g.src)]), (cost_t)0, 1, + type, par->streams[1]); + } + + // Set level 0 to start, and finished pointer. + state->finished = engine_get_finished_ptr(par->id); + state->level = 0; +} + +// The finalize phase - clean up. 
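// Releases only the per-partition state allocated in bfs_init (the visited
// bitmap, the frontier state, and the cost array); the global cost_h staging
// buffer is freed later by bfs_stepwise_hybrid.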
+PRIVATE void bfs_finalize(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + bfs_state_t* state = reinterpret_cast(par->algo_state); + totem_mem_t type = TOTEM_MEM_HOST; + + // Finalize frontiers. + if (par->processor.type == PROCESSOR_CPU) { + bitmap_finalize_cpu(state->visited[par->id]); + frontier_finalize_cpu(&state->frontier_state); + } else if (par->processor.type == PROCESSOR_GPU) { + bitmap_finalize_gpu(state->visited[par->id]); + type = TOTEM_MEM_DEVICE; + frontier_finalize_gpu(&state->frontier_state); + } else { + assert(false); + } + + // Free memory. + totem_free(state->cost, type); + free(state); + par->algo_state = NULL; +} + +// The launch point for the algorithm - set up engine, cost, and launch. +error_t bfs_stepwise_hybrid(vid_t src, cost_t* cost) { + // Check for special cases. + bool finished = false; + error_t rc = check_special_cases(src, cost, &finished); + if (finished) { return rc; } + + // Initialize the global state. + state_g.cost = cost; + state_g.src = engine_vertex_id_in_partition(src); + + // Initialize the engine. + engine_config_t config = { + NULL, bfs, NULL, bfs_gather, bfs_init, bfs_finalize, bfs_aggregate, + GROOVES_PULL + }; + engine_config(&config); + if (engine_largest_gpu_partition()) { + CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(cost_t), + TOTEM_MEM_HOST, + reinterpret_cast(&state_g.cost_h))); + } + engine_execute(); + + // Clean up and return. + if (engine_largest_gpu_partition()) { + totem_free(state_g.cost_h, TOTEM_MEM_HOST); + } + memset(&state_g, 0, sizeof(bfs_global_state_t)); + return SUCCESS; +} diff --git a/src/alg/totem_page_rank.cu b/src/alg/totem_page_rank.cu index 0f4321f..c939e81 100644 --- a/src/alg/totem_page_rank.cu +++ b/src/alg/totem_page_rank.cu @@ -27,19 +27,17 @@ */ typedef struct { // One is added to make it easy to calculate the number of neighbors of the - // last vertex. Another one is added to ensure 8 bytes alignment irrespective - // whether sizeof(eid_t) is 4 or 8. Alignment is enforced for performance + // last vertex. Another one is added to ensure 8 bytes alignment irrespective + // whether sizeof(eid_t) is 4 or 8. Alignment is enforced for performance // reasons. eid_t vertices[VWARP_DEFAULT_BATCH_SIZE + 2]; rank_t rank[VWARP_DEFAULT_BATCH_SIZE]; } vwarp_mem_t; -/** - * Checks for input parameters and special cases. This is invoked at the - * beginning of public interfaces (GPU and CPU) -*/ +// Checks for input parameters and special cases. This is invoked at the +// beginning of public interfaces (GPU and CPU). PRIVATE -error_t check_special_cases(const graph_t* graph, rank_t* rank, +error_t check_special_cases(const graph_t* graph, rank_t* rank, bool* finished) { *finished = true; if (graph == NULL) { @@ -54,26 +52,23 @@ error_t check_special_cases(const graph_t* graph, rank_t* rank, return SUCCESS; } -/** - * A common initialization function for GPU implementations. It allocates and - * initalizes state on the GPU -*/ + +// A common initialization function for GPU implementations. It allocates and +// initalizes state on the GPU. 
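// The mailbox buffer, which accumulates the incoming rank contributions, is
// zero-initialized via totem_calloc; the rank buffer is either seeded from
// rank_i or set uniformly to 1 / vertex_count when rank_i is NULL.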
PRIVATE -error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, +error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, graph_t** graph_d, rank_t **rank_d, rank_t** mailbox_d) { totem_mem_t type = TOTEM_MEM_DEVICE; - - // will be passed to the kernel CHK_SUCCESS(graph_initialize_device(graph, graph_d), err); - - // allocate mailbox and outbox device buffers CHK_SUCCESS(totem_calloc(graph->vertex_count * sizeof(rank_t), type, - (void**)mailbox_d), err_free_graph_d); + reinterpret_cast(mailbox_d)), + err_free_graph_d); CHK_SUCCESS(totem_malloc(rank_length * sizeof(rank_t), type, - (void**)rank_d), err_free_mailbox); + reinterpret_cast(rank_d)), + err_free_mailbox); if (rank_i == NULL) { - rank_t initial_value = 1 / (rank_t)graph->vertex_count; + rank_t initial_value = 1 / static_cast(graph->vertex_count); totem_memset(*rank_d, initial_value, rank_length, type); } else { CHK_CU_SUCCESS(cudaMemcpy(*rank_d, rank_i, rank_length * sizeof(rank_t), @@ -93,15 +88,13 @@ error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, return FAILURE; } -/** - * A common finalize function for GPU implementations. It allocates the host - * output buffer, moves the final results from GPU to the host buffers and - * frees up some resources. -*/ +// A common finalize function for GPU implementations. It allocates the host +// output buffer, moves the final results from GPU to the host buffers and +// frees up some resources. PRIVATE -error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, +error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, rank_t* rank) { - // Copy back the final result + // Copy back the final result. CHK_CU_SUCCESS(cudaMemcpy(rank, rank_d, graph_d->vertex_count * sizeof(rank_t), cudaMemcpyDeviceToHost), err); totem_free(rank_d, TOTEM_MEM_DEVICE); @@ -113,14 +106,12 @@ error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, return FAILURE; } -/** - * Phase1 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * Produce the sum of the neighbors' ranks. Each vertex atomically - * adds its value to the mailbox of the destination neighbor vertex. - */ +// Phase1 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// Produce the sum of the neighbors' ranks. Each vertex atomically +// adds its value to the mailbox of the destination neighbor vertex. __global__ void sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index + // Get the thread's linear index. vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; @@ -132,74 +123,63 @@ void sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { } } -/** - * Phase2 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * Produce the rank of each vertex. The sum of ranks coming from the incoming - * edges is stored in the mailbox of the vertex. - */ +// Phase2 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// Produce the rank of each vertex. The sum of ranks coming from the incoming +// edges is stored in the mailbox of the vertex. __global__ void compute_normalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index + // Get the thread's linear index. vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of incoming neighbors' ranks + // Get sum of incoming neighbors' ranks. 
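  // The update applied below is r(v) = (1 - d) / |V| + d * sum, with
  // d = PAGE_RANK_DAMPING_FACTOR. Dividing the result by the out-degree
  // pre-normalizes the rank so that the next round's phase-1 kernel can
  // simply add rank[v] into each neighbour's mailbox.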
rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my normalized rank - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my normalized rank. + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = my_rank / (graph.vertices[vertex_id + 1] - graph.vertices[vertex_id]); } -/** - * Phase2 final kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * This kernel is similar to the compute_normalized_rank_kernel. The difference - * is that it does not normalize the rank (by dividing it by the number of - * neighbors). It is invoked in the end to get the final, un-normalized, rank. - */ +// Phase2 final kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// This kernel is similar to the compute_normalized_rank_kernel. The difference +// is that it does not normalize the rank (by dividing it by the number of +// neighbors). It is invoked in the end to get the final, un-normalized, rank. __global__ -void compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, +void compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of neighbors' ranks rank_t sum = mailbox[vertex_id]; - // calculate my rank - rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); } -/** - * The neighbors processing function. This function adds the a vertex rank to - * to the mailbox of all neighbors. The assumption is that the threads of a warp - * invoke this function to process the warp's batch of work. In each iteration - * of the for loop, each thread processes a neighbor. For example, thread 0 in - * the warp processes neighbors at indices 0, VWARP_DEFAULT_WARP_WIDTH, - * (2 * VWARP_DEFAULT_WARP_WIDTH) etc. in the edges array, while thread 1 in - * the warp processes neighbors 1, (1 + VWARP_DEFAULT_WARP_WIDTH), - * (1 + 2 * VWARP_DEFAULT_WARP_WIDTH) and so on. -*/ +// The neighbors processing function. This function adds the a vertex rank to +// to the mailbox of all neighbors. The assumption is that the threads of a warp +// invoke this function to process the warp's batch of work. In each iteration +// of the for loop, each thread processes a neighbor. For example, thread 0 in +// the warp processes neighbors at indices 0, VWARP_DEFAULT_WARP_WIDTH, +// (2 * VWARP_DEFAULT_WARP_WIDTH) etc. in the edges array, while thread 1 in +// the warp processes neighbors 1, (1 + VWARP_DEFAULT_WARP_WIDTH), +// (1 + 2 * VWARP_DEFAULT_WARP_WIDTH) and so on. __device__ inline -void vwarp_process_neighbors(vid_t warp_offset, vid_t neighbor_count, - vid_t* neighbors, rank_t my_rank, +void vwarp_process_neighbors(vid_t warp_offset, vid_t neighbor_count, + vid_t* neighbors, rank_t my_rank, rank_t* mailbox) { - for(vid_t i = warp_offset; i < neighbor_count; - i += VWARP_DEFAULT_WARP_WIDTH) { + for (vid_t i = warp_offset; i < neighbor_count; + i += VWARP_DEFAULT_WARP_WIDTH) { const vid_t neighbor_id = neighbors[i]; atomicAdd(&(mailbox[neighbor_id]), my_rank); } } -/** - * Phase1 kernel of the vwarp PageRank GPU algorithm. - * Produce the sum of the neighbors' ranks. Each vertex atomically - * adds its value to the mailbox of the destination neighbor vertex. 
- */ +// Phase1 kernel of the vwarp PageRank GPU algorithm. +// Produce the sum of the neighbors' ranks. Each vertex atomically +// adds its value to the mailbox of the destination neighbor vertex. __global__ -void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, +void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox, uint32_t thread_count) { if (THREAD_GLOBAL_INDEX >= thread_count) return; vid_t warp_offset = THREAD_GLOBAL_INDEX % VWARP_DEFAULT_WARP_WIDTH; @@ -207,18 +187,18 @@ void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, __shared__ vwarp_mem_t shared_memory[MAX_THREADS_PER_BLOCK / VWARP_DEFAULT_WARP_WIDTH]; - vwarp_mem_t* my_space = &shared_memory[THREAD_BLOCK_INDEX / + vwarp_mem_t* my_space = &shared_memory[THREAD_BLOCK_INDEX / VWARP_DEFAULT_WARP_WIDTH]; - // copy my work to local space + // Copy my work to local space. vid_t v_ = warp_id * VWARP_DEFAULT_BATCH_SIZE; - vwarp_memcpy(my_space->rank, &rank[v_], + vwarp_memcpy(my_space->rank, &rank[v_], VWARP_DEFAULT_BATCH_SIZE, warp_offset); - vwarp_memcpy(my_space->vertices, &(graph.vertices[v_]), + vwarp_memcpy(my_space->vertices, &(graph.vertices[v_]), VWARP_DEFAULT_BATCH_SIZE + 1, warp_offset); - // iterate over my work - for(vid_t v = 0; v < VWARP_DEFAULT_BATCH_SIZE; v++) { + // Iterate over my work. + for (vid_t v = 0; v < VWARP_DEFAULT_BATCH_SIZE; v++) { vid_t neighbor_count = my_space->vertices[v + 1] - my_space->vertices[v]; vid_t* neighbors = &(graph.edges[my_space->vertices[v]]); vwarp_process_neighbors(warp_offset, neighbor_count, neighbors, @@ -236,20 +216,20 @@ void vwarp_compute_normalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of incoming neighbors' ranks + // Get the sum of incoming neighbors' ranks. rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my normalized rank - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my normalized rank. + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = my_rank / (graph.vertices[vertex_id + 1] - graph.vertices[vertex_id]); } /** - * Phase2 final kernel of the vwarp PageRank GPU algorithm. This kernel is - * similar to the compute_normalized_rank_kernel. The difference is that it - * does not normalize the rank (by dividing it by the number of neighbors). + * Phase2 final kernel of the vwarp PageRank GPU algorithm. This kernel is + * similar to the compute_normalized_rank_kernel. The difference is that it + * does not normalize the rank (by dividing it by the number of neighbors). * It is invoked in the end to get the final, un-normalized, rank. */ __global__ @@ -257,21 +237,21 @@ void vwarp_compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of neighbors' ranks + // Get the sum of neighbors' ranks. rank_t sum = mailbox[vertex_id]; - // calculate my rank - rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my rank. + rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); } __host__ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. 
bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); if (finished) return rc; - // Allocate and initialize GPU state + // Allocate and initialize GPU state. graph_t* graph_d; rank_t* rank_d; rank_t* mailbox_d; @@ -280,39 +260,39 @@ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { CHK_SUCCESS(initialize_gpu(graph, rank_i, rank_length, &graph_d, &rank_d, &mailbox_d), err); - {// Configure the kernels. Setup the number of threads for phase1 and phase2, - // configure the on-chip memory as shared memory rather than L1 cache - dim3 blocks1, threads_per_block1, blocks2, threads_per_block2; - vid_t phase1_thread_count = - vwarp_default_thread_count(graph->vertex_count); - KERNEL_CONFIGURE(phase1_thread_count, blocks1, threads_per_block1); - KERNEL_CONFIGURE(graph->vertex_count, blocks2, threads_per_block2); - cudaFuncSetCacheConfig(vwarp_sum_neighbors_rank_kernel, - cudaFuncCachePreferShared); - - // Iterate for a specific number of rounds - for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + { + // Configure the kernels. Setup the number of threads for phase1 and phase2, + // configure the on-chip memory as shared memory rather than L1 cache. + dim3 blocks1, threads_per_block1, blocks2, threads_per_block2; + vid_t phase1_thread_count = + vwarp_default_thread_count(graph->vertex_count); + KERNEL_CONFIGURE(phase1_thread_count, blocks1, threads_per_block1); + KERNEL_CONFIGURE(graph->vertex_count, blocks2, threads_per_block2); + cudaFuncSetCacheConfig(vwarp_sum_neighbors_rank_kernel, + cudaFuncCachePreferShared); + + // Iterate for a specific number of rounds. + for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + vwarp_sum_neighbors_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d, phase1_thread_count); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + vwarp_compute_normalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + } + // Final round is seprate. It computes an un-normalized final rank. vwarp_sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d, phase1_thread_count); + (*graph_d, rank_d, mailbox_d, phase1_thread_count); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - vwarp_compute_normalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + vwarp_compute_unnormalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); } - // Final round is seprate. It computes an un-normalized final rank - vwarp_sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d, phase1_thread_count); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - vwarp_compute_unnormalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - } - // Copy the result back from GPU and clean up + // Copy the result back from GPU and clean up. CHK_SUCCESS(finalize_gpu(graph_d, rank_d, mailbox_d, rank), err_free_all); return SUCCESS; - // error handlers err_free_all: totem_free(rank_d, TOTEM_MEM_DEVICE); totem_free(mailbox_d, TOTEM_MEM_DEVICE); @@ -323,44 +303,43 @@ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { __host__ error_t page_rank_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); if (finished) return rc; - // Allocate and initialize GPU state + // Allocate and initialize GPU state. 
graph_t* graph_d; rank_t* rank_d; rank_t* mailbox_d; - CHK_SUCCESS(initialize_gpu(graph, rank_i, graph->vertex_count, + CHK_SUCCESS(initialize_gpu(graph, rank_i, graph->vertex_count, &graph_d, &rank_d, &mailbox_d), err); { - dim3 blocks, threads_per_block; - KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); - // Iterate for a specific number of rounds - for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + dim3 blocks, threads_per_block; + KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); + // Iterate for a specific number of rounds. + for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + sum_neighbors_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + compute_normalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + } + // Final round is seprate. It computes an un-normalized final rank. sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - compute_normalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + compute_unnormalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); } - // Final round is seprate. It computes an un-normalized final rank - sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - compute_unnormalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - } - // Copy the result back from GPU and clean up + // Copy the result back from GPU and clean up. CHK_SUCCESS(finalize_gpu(graph_d, rank_d, mailbox_d, rank), err_free_all); return SUCCESS; - // error handlers err_free_all: totem_free(rank_d, TOTEM_MEM_DEVICE); totem_free(mailbox_d, TOTEM_MEM_DEVICE); @@ -370,17 +349,16 @@ error_t page_rank_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); - if (finished) return rc; + if (finished) { return rc; } - // allocate buffers rank_t* mailbox; - CALL_SAFE(totem_malloc(graph->vertex_count * sizeof(rank_t), TOTEM_MEM_HOST, - (void**)&mailbox)); + CALL_SAFE(totem_malloc(graph->vertex_count * sizeof(rank_t), TOTEM_MEM_HOST, + reinterpret_cast(&mailbox))); - // initialize the rank of each vertex + // Initialize the rank of each vertex. if (rank_i == NULL) { rank_t initial_value = 1 / (rank_t)graph->vertex_count; for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { @@ -395,13 +373,13 @@ error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } for (int round = 0; round < PAGE_RANK_ROUNDS; round++) { - // iterate over all vertices to calculate the ranks for this round + // Iterate over all vertices to calculate the ranks for this round // The "runtime" scheduling clause defer the choice of thread scheduling // algorithm to the choice of the client, either via OS environment variable // or omp_set_schedule interface. OMP(omp parallel for schedule(runtime)) - for(vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { - // calculate the sum of all neighbors' rank + for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { + // Calculate the sum of the neighbors' ranks. 
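      // (Each vertex contributes its current rank value to its neighbours'
      // mailboxes here; the second parallel loop below then folds the
      // accumulated sums into the damping formula and, except in the last
      // round, re-normalizes by the out-degree.)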
rank_t my_rank = rank[vertex_id]; for (eid_t i = graph->vertices[vertex_id]; i < graph->vertices[vertex_id + 1]; i++) { @@ -411,24 +389,21 @@ error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } // The loop has no load balancing issues, hence the choice of dividing - // the iterations between the threads statically via the static schedule - // clause + // the iterations between the threads statically via the static schedule + // clause. OMP(omp parallel for schedule(static)) - for(vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { - // get sum of neighbors' ranks + for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my rank - vid_t neighbors_count = + vid_t neighbors_count = graph->vertices[vertex_id + 1] - graph->vertices[vertex_id]; - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph->vertex_count) + + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph->vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = (round == (PAGE_RANK_ROUNDS - 1)) ? my_rank : my_rank / neighbors_count; } } - // we are done! set the output and clean up. totem_free(mailbox, TOTEM_MEM_HOST); return SUCCESS; } diff --git a/src/alg/totem_page_rank_hybrid.cu b/src/alg/totem_page_rank_hybrid.cu index b5162ae..c31e24a 100644 --- a/src/alg/totem_page_rank_hybrid.cu +++ b/src/alg/totem_page_rank_hybrid.cu @@ -21,9 +21,7 @@ #include "totem_alg.h" #include "totem_engine.cuh" -/** - * PageRank specific state - */ +// PageRank-specific state. typedef struct page_rank_state_s { rank_t* rank; rank_t* rank_s; @@ -33,23 +31,15 @@ typedef struct page_rank_state_s { dim3 threads_sum; } page_rank_state_t; -/** - * Stores the final result - */ -rank_t* rank_g = NULL; +// Stores the final result. +PRIVATE rank_t* rank_g = NULL; -/** - * Used as a temporary buffer to host the final result produced by - * GPU partitions - */ -rank_t* rank_h = NULL; +// Used as a temporary buffer to host the final result produced by +// GPU partitions. +PRIVATE rank_t* rank_h = NULL; -/** - * Checks for input parameters and special cases. This is invoked at the - * beginning of public interfaces (GPU and CPU) -*/ -PRIVATE -error_t check_special_cases(rank_t* rank, bool* finished) { +// Checks for input parameters and special cases. +PRIVATE error_t check_special_cases(rank_t* rank, bool* finished) { *finished = true; if (engine_vertex_count() == 0) { return FAILURE; @@ -61,32 +51,28 @@ error_t check_special_cases(rank_t* rank, bool* finished) { return SUCCESS; } -/** - * This structure is used by virtual warp-based implementation. It stores a - * batch of work. It is typically allocated on shared memory and is processed by - * a single virtual warp. - */ +// This structure is used by virtual warp-based implementation. It stores a +// batch of work. It is typically allocated on shared memory and is processed by +// a single virtual warp. typedef struct { eid_t vertices[VWARP_DEFAULT_BATCH_SIZE + 2]; rank_t rank[VWARP_DEFAULT_BATCH_SIZE]; } vwarp_mem_t; -/** - * Phase1 kernel of the PageRank GPU algorithm. Produce the sum of - * the neighbors' ranks. Each vertex atomically adds its value to - * the temporary rank (rank_s) of the destination neighbor vertex. - */ +// Phase1 kernel of the PageRank GPU algorithm. Produce the sum of +// the neighbors' ranks. Each vertex atomically adds its value to +// the temporary rank (rank_s) of the destination neighbor vertex. 
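// Each virtual warp of VWARP_DEFAULT_WARP_WIDTH threads first stages its
// batch's vertex offsets and rank values in shared memory (vwarp_mem_t),
// then strides cooperatively over each vertex's edge list. Destinations are
// resolved through engine_get_dst_ptr, so a contribution lands either in the
// local rank_s buffer or in the outbox of a remote partition.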
__global__ -void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, +void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, rank_t* rank_s, vid_t thread_count) { - if (THREAD_GLOBAL_INDEX >= thread_count) return; + if (THREAD_GLOBAL_INDEX >= thread_count) { return; } vid_t warp_offset = THREAD_GLOBAL_INDEX % VWARP_DEFAULT_WARP_WIDTH; vid_t warp_id = THREAD_GLOBAL_INDEX / VWARP_DEFAULT_WARP_WIDTH; - // copy my work to local space - __shared__ vwarp_mem_t smem[MAX_THREADS_PER_BLOCK / + // Copy my work to local space. + __shared__ vwarp_mem_t smem[MAX_THREADS_PER_BLOCK / VWARP_DEFAULT_WARP_WIDTH]; - vwarp_mem_t* my_space = &smem[THREAD_BLOCK_INDEX / + vwarp_mem_t* my_space = &smem[THREAD_BLOCK_INDEX / VWARP_DEFAULT_WARP_WIDTH]; vid_t v_ = warp_id * VWARP_DEFAULT_BATCH_SIZE; int my_batch_size = VWARP_DEFAULT_BATCH_SIZE; @@ -97,26 +83,28 @@ void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, vwarp_memcpy(my_space->vertices, &(par.subgraph.vertices[v_]), my_batch_size + 1, warp_offset); - // iterate over my work - for(vid_t v = 0; v < my_batch_size; v++) { + // Iterate over my work. + for (vid_t v = 0; v < my_batch_size; v++) { vid_t nbr_count = my_space->vertices[v + 1] - my_space->vertices[v]; vid_t* nbrs = &(par.subgraph.edges[my_space->vertices[v]]); - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_DEFAULT_WARP_WIDTH) { + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_DEFAULT_WARP_WIDTH) { const vid_t nbr = nbrs[i]; - rank_t* dst = engine_get_dst_ptr(par.id, nbr, par.outbox, rank_s); + rank_t* dst = engine_get_dst_ptr(par.id, nbr, par.outbox, rank_s); atomicAdd(dst, my_space->rank[v]); } } } __global__ -void compute_normalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, +void compute_normalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, rank_t* rank_s) { vid_t v = THREAD_GLOBAL_INDEX; - if (v >= par.subgraph.vertex_count) return; - rank_t r = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + + if (v >= par.subgraph.vertex_count) { return; } + vid_t nbr_count = (par.subgraph.vertices[v + 1] - par.subgraph.vertices[v]); + if (nbr_count == 0) { return; } + rank_t r = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + (PAGE_RANK_DAMPING_FACTOR * rank_s[v]); - rank[v] = r / (par.subgraph.vertices[v + 1] - par.subgraph.vertices[v]); + rank[v] = r / nbr_count; rank_s[v] = 0; } @@ -125,14 +113,14 @@ void compute_unnormalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, rank_t* rank_s) { vid_t v = THREAD_GLOBAL_INDEX; if (v >= par.subgraph.vertex_count) return; - rank[v] = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + + rank[v] = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + (PAGE_RANK_DAMPING_FACTOR * rank_s[v]); } PRIVATE void page_rank_gpu(partition_t* par) { - page_rank_state_t* ps = (page_rank_state_t*)par->algo_state; + page_rank_state_t* ps = reinterpret_cast(par->algo_state); if (engine_superstep() > 1) { - // compute my rank + // Compute my rank. 
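  // Intermediate supersteps run the normalized kernel, which divides the new
  // rank by the out-degree so it is ready to be pushed in the next round;
  // the final superstep runs the un-normalized kernel so the output holds
  // the actual PageRank values.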
if (engine_superstep() != PAGE_RANK_ROUNDS) { compute_normalized_rank_kernel<<blocks_rank, ps->threads_rank, 0, par->streams[1]>>>(*par, engine_vertex_count(), ps->rank, ps->rank_s); @@ -143,41 +131,40 @@ PRIVATE void page_rank_gpu(partition_t* par) { CALL_CU_SAFE(cudaGetLastError()); } } - // communicate the ranks - engine_set_outbox(par->id, 0); + + engine_set_outbox(par->id, (rank_t)0); vwarp_sum_neighbors_rank_kernel<<blocks_sum, ps->threads_sum, 0, - par->streams[1]>>>(*par, ps->rank, ps->rank_s, + par->streams[1]>>>(*par, ps->rank, ps->rank_s, vwarp_default_thread_count(par->subgraph.vertex_count)); CALL_CU_SAFE(cudaGetLastError()); } PRIVATE void page_rank_cpu(partition_t* par) { - page_rank_state_t* ps = (page_rank_state_t*)par->algo_state; + page_rank_state_t* ps = reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; vid_t vcount = engine_vertex_count(); int round = engine_superstep(); - if (round > 1) { - // compute my rank The loop has no load balancing issues, hence the choice + // Compute my rank The loop has no load balancing issues, hence the choice // of dividing the iterations between the threads statically via the static - // schedule clause + // schedule clause. OMP(omp parallel for schedule(static)) - for(vid_t v = 0; v < subgraph->vertex_count; v++) { - vid_t nbrs = subgraph->vertices[v + 1] - subgraph->vertices[v]; + for (vid_t v = 0; v < subgraph->vertex_count; v++) { + vid_t nbr_count = subgraph->vertices[v + 1] - subgraph->vertices[v]; + if (nbr_count == 0) { continue; } rank_t rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / vcount) + (PAGE_RANK_DAMPING_FACTOR * ps->rank_s[v]); - ps->rank[v] = (round == (PAGE_RANK_ROUNDS)) ? rank : rank / nbrs; + ps->rank[v] = (round == PAGE_RANK_ROUNDS) ? rank : rank / nbr_count; ps->rank_s[v] = 0; } } - // communicate the ranks - engine_set_outbox(par->id, 0); + engine_set_outbox(par->id, (rank_t)0); // The "runtime" scheduling clause defer the choice of thread scheduling // algorithm to the choice of the client, either via OS environment variable // or omp_set_schedule interface. 
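    // The amount of work per vertex below is proportional to its out-degree,
    // so the best schedule depends on the degree distribution of the input
    // graph; hence the choice is left to the caller.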
OMP(omp parallel for schedule(runtime)) - for(vid_t v = 0; v < subgraph->vertex_count; v++) { + for (vid_t v = 0; v < subgraph->vertex_count; v++) { rank_t my_rank = ps->rank[v]; for (eid_t i = subgraph->vertices[v]; i < subgraph->vertices[v + 1]; i++) { vid_t nbr = subgraph->edges[i]; @@ -188,6 +175,7 @@ PRIVATE void page_rank_cpu(partition_t* par) { } PRIVATE void page_rank(partition_t* partition) { + if (partition->subgraph.vertex_count == 0) { return; } if (partition->processor.type == PROCESSOR_GPU) { page_rank_gpu(partition); } else { @@ -200,13 +188,15 @@ PRIVATE void page_rank(partition_t* partition) { } PRIVATE void page_rank_scatter(partition_t* partition) { - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); engine_scatter_inbox_add(partition->id, ps->rank_s); } PRIVATE void page_rank_aggr(partition_t* partition) { - if (!partition->subgraph.vertex_count) return; - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + if (partition->subgraph.vertex_count == 0) { return; } + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); graph_t* subgraph = &partition->subgraph; rank_t* src_rank = NULL; if (partition->processor.type == PROCESSOR_GPU) { @@ -226,10 +216,10 @@ PRIVATE void page_rank_aggr(partition_t* partition) { PRIVATE void page_rank_init(partition_t* par) { vid_t vcount = par->subgraph.vertex_count; - if (vcount == 0) return; + if (vcount == 0) { return; } page_rank_state_t* ps = NULL; - CALL_SAFE(totem_calloc(sizeof(page_rank_state_t), TOTEM_MEM_HOST, - (void**)&ps)); + CALL_SAFE(totem_calloc(sizeof(page_rank_state_t), TOTEM_MEM_HOST, + reinterpret_cast(&ps))); totem_mem_t type = TOTEM_MEM_HOST; if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; @@ -237,20 +227,24 @@ PRIVATE void page_rank_init(partition_t* par) { ps->blocks_sum, ps->threads_sum); KERNEL_CONFIGURE(vcount, ps->blocks_rank, ps->threads_rank); } - CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, (void**)&(ps->rank_s))); - CALL_SAFE(totem_malloc(vcount * sizeof(rank_t), type, (void**)&(ps->rank))); + CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, + reinterpret_cast(&ps->rank_s))); + CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, + reinterpret_cast(&ps->rank))); rank_t init_value = 1 / (rank_t)engine_vertex_count(); totem_memset(ps->rank, init_value, vcount, type, par->streams[1]); par->algo_state = ps; } PRIVATE void page_rank_finalize(partition_t* partition) { + if (partition->subgraph.vertex_count == 0) { return; } assert(partition->algo_state); - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); totem_mem_t type = TOTEM_MEM_HOST; if (partition->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; - } + } totem_free(ps->rank, type); totem_free(ps->rank_s, type); totem_free(ps, TOTEM_MEM_HOST); @@ -268,13 +262,14 @@ error_t page_rank_hybrid(rank_t *rank_i, rank_t* rank) { // initialize the engine engine_config_t config = { - NULL, page_rank, page_rank_scatter, NULL, page_rank_init, + NULL, page_rank, page_rank_scatter, NULL, page_rank_init, page_rank_finalize, page_rank_aggr, GROOVES_PUSH }; engine_config(&config); if (engine_largest_gpu_partition()) { - CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(rank_t), - TOTEM_MEM_HOST_PINNED, (void**)&rank_h)); + CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(rank_t), + TOTEM_MEM_HOST_PINNED, + 
reinterpret_cast(&rank_h))); } engine_execute(); diff --git a/src/alg/totem_stcon.cu b/src/alg/totem_stcon.cu deleted file mode 100644 index cff57eb..0000000 --- a/src/alg/totem_stcon.cu +++ /dev/null @@ -1,64 +0,0 @@ -/* TODO(lauro,abdullah,elizeu): Add license. - * - * This file contains an implementation of the ST Connectivity. - * - * Created on: 2011-04-01 - * Author: Lauro Beltrão Costa - */ - -// totem includes -#include "totem_alg.h" - -__host__ -error_t stcon_gpu(const graph_t* graph, vid_t source_id, vid_t destination_id, - bool* connected) { - if((graph == NULL) || (source_id >= graph->vertex_count) - || (destination_id >= graph->vertex_count)) { - *connected = false; - return FAILURE; - } - if( source_id == destination_id ) { - *connected = true; - return SUCCESS; - } - - dim3 blocks; - dim3 threads_per_block; - KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); - //TODO(abdullah, lauro) handle the case (vertex_count > number of threads). - assert(graph->vertex_count <= MAX_THREAD_COUNT); - - // Create graph on GPU memory. - graph_t* graph_d; - CHK_SUCCESS(graph_initialize_device(graph, &graph_d), err); - - // TODO(lauro): Finish stcon_gpu implementation. - - graph_finalize_device(graph_d); - *connected = false; - return FAILURE; - - // error handlers - err: - *connected = false; - return FAILURE; -} - -__host__ -error_t stcon_cpu(const graph_t* graph, vid_t source_id, vid_t destination_id, - bool* connected) { - if((graph == NULL) || (source_id >= graph->vertex_count) - || (destination_id >= graph->vertex_count)) { - *connected = false; - return FAILURE; - } - if( source_id == destination_id ) { - *connected = true; - return SUCCESS; - } - - // TODO(lauro): Finish stcon_cpu implementation. - - *connected = false; - return FAILURE; -} diff --git a/src/make.defs b/src/make.defs index 4e853a0..22c175a 100644 --- a/src/make.defs +++ b/src/make.defs @@ -150,5 +150,7 @@ GTEST_SRC = $(GTEST_PATH)/src/gtest_main.cc $(GTEST_PATH)/src/gtest-all.cc TEST_PATH = $(ROOTDIR)/test TEST_DEPS = $(wildcard $(TEST_PATH)/*.[cuh,h]) +WATTSUP_PATH = $(ROOTDIR)/thirdparty/wattsup + TOTEM_PATH = $(ROOTDIR)/totem TOTEM_DEPS = $(wildcard $(TOTEM_PATH)/*.[cuh,h]) diff --git a/src/makefile b/src/makefile index 2958644..52b6a1e 100644 --- a/src/makefile +++ b/src/makefile @@ -21,6 +21,9 @@ graph500: test: @$(MAKE) -C $(TEST_PATH) +wattsup: + @$(MAKE) -C $(WATTSUP_PATH) + clean-build: @rm -rf *~ $(BUILDDIR) diff --git a/src/test/totem_betweenness_unittest.cu b/src/test/totem_betweenness_unittest.cu index e37dcdc..61e7c5c 100644 --- a/src/test/totem_betweenness_unittest.cu +++ b/src/test/totem_betweenness_unittest.cu @@ -1,5 +1,4 @@ -/* TODO(lauro,abdullah,elizeu): Add license. - * +/* * Contains unit tests for betweenness centrality. 
* * Created on: 2011-10-21 @@ -12,50 +11,17 @@ #if GTEST_HAS_PARAM_TEST using ::testing::TestWithParam; -using ::testing::Values; - -/** - * Wrapper for betweenness_cpu to provide the singature expected for use in - * the unit tests with the other Betweenness Centrality algorithms - */ -PRIVATE error_t betweenness_cpu_exact(const graph_t* graph, - score_t* betweenness_score) { - // call betweenness_cpu for use in unit test framework with exact precision - return betweenness_cpu(graph, CENTRALITY_EXACT, betweenness_score); -} - -/** - * Wrapper for betweenness_gpu to provide the singature expected for use in - * the unit tests with the other Betweenness Centrality algorithms - */ -PRIVATE error_t betweenness_gpu_exact(const graph_t* graph, - score_t* betweenness_score) { - // call betweenness_gpu for use in unit test framework with exact precision - return betweenness_gpu(graph, CENTRALITY_EXACT, betweenness_score); -} +using ::testing::ValuesIn; -// The following implementation relies on -// TestWithParam to test the two versions of Betweenness -// Centrality implemented: CPU and GPU. Details on how to use TestWithParam -// can be found at: -// http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc +typedef error_t(*BetwCentralityFunction)(const graph_t*, double, score_t*); +typedef error_t(*BetwCentralityHybridFunction)(double, score_t*); -typedef error_t(*BetwCentralityFunction)(const graph_t*, score_t*); - -// This is to allow testing the vanilla BC functions and the hybrid one -// that is based on the framework. Note that have a different signature -// of the hybrid algorithm forced this work-around. -typedef struct betweenness_param_s { - totem_attr_t* attr; // totem attributes for totem-based tests - BetwCentralityFunction func; // the vanilla BC function if attr is NULL -} betweenness_param_t; - -class BetweennessCentralityTest : public TestWithParam { +class BetweennessCentralityTest : public TestWithParam { public: virtual void SetUp() { - // Ensure the minimum CUDA architecture is supported + // Ensure the minimum CUDA architecture is supported. 
CUDA_CHECK_VERSION(); - _betweenness_param = GetParam(); + _param = GetParam(); _graph = NULL; _betweenness_score = NULL; } @@ -69,35 +35,40 @@ class BetweennessCentralityTest : public TestWithParam { error_t TestGraph() { if (_graph->vertex_count) { - totem_malloc(_graph->vertex_count * sizeof(score_t), - TOTEM_MEM_HOST, (void**)&_betweenness_score); + totem_malloc(_graph->vertex_count * sizeof(score_t), + TOTEM_MEM_HOST, + reinterpret_cast(&_betweenness_score)); } - if (_betweenness_param->attr != NULL) { - _betweenness_param->attr->push_msg_size = + if (_param->attr != NULL) { + _param->attr->push_msg_size = sizeof(uint32_t) * BITS_PER_BYTE; - _betweenness_param->attr->pull_msg_size = + _param->attr->pull_msg_size = sizeof(score_t) * BITS_PER_BYTE; - if (totem_init(_graph, _betweenness_param->attr) == FAILURE) { + if (totem_init(_graph, _param->attr) == FAILURE) { return FAILURE; } - // Will use exact betweenness centrality for test framework - error_t err = betweenness_hybrid(CENTRALITY_EXACT, _betweenness_score); + BetwCentralityHybridFunction func = + reinterpret_cast + (_param->func); + error_t err = func(CENTRALITY_EXACT, _betweenness_score); totem_finalize(); return err; } - return _betweenness_param->func(_graph, _betweenness_score); + BetwCentralityFunction func = + reinterpret_cast(_param->func); + return func(_graph, CENTRALITY_EXACT, _betweenness_score); } protected: - betweenness_param_t* _betweenness_param; - graph_t* _graph; - score_t* _betweenness_score; + test_param_t* _param; + graph_t* _graph; + score_t* _betweenness_score; }; // Tests BetwCentrality for empty graphs. TEST_P(BetweennessCentralityTest, Empty) { - _graph = (graph_t*)calloc(sizeof(graph_t), 1); + _graph = reinterpret_cast(calloc(sizeof(graph_t), 1)); EXPECT_EQ(FAILURE, TestGraph()); free(_graph); _graph = NULL; @@ -115,16 +86,20 @@ TEST_P(BetweennessCentralityTest, SingleNodeUnweighted) { TEST_P(BetweennessCentralityTest, Chain100Unweighted) { graph_initialize(DATA_FOLDER("chain_100_nodes_weight_directed.totem"), false, &_graph); - // First vertex as source + // First vertex as source. EXPECT_EQ(SUCCESS, TestGraph()); - score_t centrality[50]; - for (vid_t i = 0; i < 50; i++) { - centrality[i] = (99 - i) * (i); + score_t* expected_centrality = + reinterpret_cast(calloc(_graph->vertex_count / 2, + sizeof(score_t))); + for (vid_t i = 0; i < (_graph->vertex_count / 2); i++) { + expected_centrality[i] = (_graph->vertex_count - 1 - i) * (i); } - for (vid_t i = 0; i < 50; i++) { - EXPECT_EQ(centrality[i], _betweenness_score[i]); - EXPECT_EQ(centrality[i], _betweenness_score[99 - i]); + for (vid_t i = 0; i < (_graph->vertex_count / 2); i++) { + EXPECT_EQ(expected_centrality[i], _betweenness_score[i]); + EXPECT_EQ(expected_centrality[i], + _betweenness_score[_graph->vertex_count - 1 - i]); } + free(expected_centrality); } // Tests BetwCentrality for a complete graph of 300 nodes. 
@@ -134,7 +109,7 @@ TEST_P(BetweennessCentralityTest, CompleteGraphUnweighted) { false, &_graph)); EXPECT_EQ(SUCCESS, TestGraph()); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { EXPECT_FLOAT_EQ(0.0, _betweenness_score[vertex]); } } @@ -145,82 +120,45 @@ TEST_P(BetweennessCentralityTest, StarGraphUnweighted) { false, &_graph)); EXPECT_EQ(SUCCESS, TestGraph()); - EXPECT_FLOAT_EQ((_graph->vertex_count - 1) * (_graph->vertex_count - 2), + EXPECT_FLOAT_EQ((_graph->vertex_count - 1) * (_graph->vertex_count - 2), _betweenness_score[0]); - for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { EXPECT_FLOAT_EQ(0.0, _betweenness_score[vertex]); } } -// Functions to test in framework -betweenness_param_t betweenness_params[] = { - {NULL, &betweenness_unweighted_cpu}, - {NULL, &betweenness_unweighted_gpu}, - {NULL, &betweenness_unweighted_shi_gpu}, - {NULL, &betweenness_cpu_exact}, - {NULL, &betweenness_gpu_exact}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL}, - {&totem_attrs[8], NULL}, - {&totem_attrs[9], NULL}, - {&totem_attrs[10], NULL}, - {&totem_attrs[11], NULL}, - {&totem_attrs[12], NULL}, - {&totem_attrs[13], NULL}, - {&totem_attrs[14], NULL}, - {&totem_attrs[15], NULL}, - {&totem_attrs[16], NULL}, - {&totem_attrs[17], NULL}, - {&totem_attrs[18], NULL}, - {&totem_attrs[19], NULL}, - {&totem_attrs[20], NULL}, - {&totem_attrs[21], NULL}, - {&totem_attrs[22], NULL}, - {&totem_attrs[23], NULL} +// Defines the set of Betweenness vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* vanilla_funcs[] = { + reinterpret_cast(&betweenness_cpu), + reinterpret_cast(&betweenness_gpu), }; +static const int vanilla_count = STATIC_ARRAY_COUNT(vanilla_funcs); + +// Defines the set of PageRank hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* hybrid_funcs[] = { + reinterpret_cast(&betweenness_hybrid), +}; +static const int hybrid_count = STATIC_ARRAY_COUNT(hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. +static const int params_count = vanilla_count + + hybrid_count * hybrid_configurations_count; +static test_param_t* params[params_count]; // From Google documentation: // In order to run value-parameterized tests, we need to instantiate them, // or bind them to a list of values which will be used as test parameters. 
// -// Values() receives a list of parameters and the framework will execute the -// whole set of tests BetweennessCentralityTest for each element of Values() -INSTANTIATE_TEST_CASE_P(BetwCentralityGPUAndCPUTest, BetweennessCentralityTest, - Values(&betweenness_params[0], - &betweenness_params[1], - &betweenness_params[2], - &betweenness_params[3], - &betweenness_params[4], - &betweenness_params[5], - &betweenness_params[6], - &betweenness_params[7], - &betweenness_params[8], - &betweenness_params[9], - &betweenness_params[10], - &betweenness_params[11], - &betweenness_params[12], - &betweenness_params[13], - &betweenness_params[14], - &betweenness_params[15], - &betweenness_params[16], - &betweenness_params[17], - &betweenness_params[18], - &betweenness_params[19], - &betweenness_params[20], - &betweenness_params[21], - &betweenness_params[22], - &betweenness_params[23], - &betweenness_params[24], - &betweenness_params[25], - &betweenness_params[26], - &betweenness_params[27], - &betweenness_params[28])); +// ValuesIn() receives a list of parameters and the framework will execute the +// whole set of tests for each entry in the array passed to ValuesIn(). +INSTANTIATE_TEST_CASE_P(BetweennessGPUAndCPUTest, BetweennessCentralityTest, + ValuesIn(GetParameters(params, params_count, + vanilla_funcs, vanilla_count, + hybrid_funcs, hybrid_count), + params + params_count)); #else // From Google documentation: diff --git a/src/test/totem_bfs_unittest.cu b/src/test/totem_bfs_unittest.cu index 1dd9aaa..d1c9cce 100644 --- a/src/test/totem_bfs_unittest.cu +++ b/src/test/totem_bfs_unittest.cu @@ -1,4 +1,4 @@ -/* +/* * Contains unit tests for an implementation of the breadth-first search (BFS) * graph search algorithm. * @@ -12,24 +12,17 @@ #if GTEST_HAS_PARAM_TEST using ::testing::TestWithParam; -using ::testing::Values; +using ::testing::ValuesIn; // The following implementation relies on TestWithParam to test -// the two versions of BFS implemented: CPU and GPU. -// Details on how to use TestWithParam can be found at: +// the the different versions of BFS. Details on how to use TestWithParam +// can be found at: // http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc typedef error_t(*BFSFunction)(graph_t*, vid_t, cost_t*); +typedef error_t(*BFSHybridFunction)(vid_t, cost_t*); -// This is to allow testing the vanilla bfs functions and the hybrid one -// that is based on the framework. Note that have a different signature -// of the hybrid algorithm forced this work-around. 
-typedef struct bfs_param_s { - totem_attr_t* attr; // totem attributes for totem-based tests - BFSFunction func; // the vanilla bfs function if attr is NULL -} bfs_param_t; - -class BFSTest : public TestWithParam { +class BFSTest : public TestWithParam { public: virtual void SetUp() { // Ensure the minimum CUDA architecture is supported @@ -46,25 +39,30 @@ class BFSTest : public TestWithParam { error_t TestGraph(vid_t src) { if (_bfs_param->attr) { _bfs_param->attr->push_msg_size = 1; + _bfs_param->attr->pull_msg_size = 1; if (totem_init(_graph, _bfs_param->attr) == FAILURE) { return FAILURE; } - error_t err = bfs_hybrid(src, _cost); + BFSHybridFunction func = + reinterpret_cast(_bfs_param->func); + error_t err = func(src, _cost); totem_finalize(); return err; } - return _bfs_param->func(_graph, src, _cost); + BFSFunction func = reinterpret_cast(_bfs_param->func); + return func(_graph, src, _cost); } + protected: - bfs_param_t* _bfs_param; - totem_mem_t _mem_type; - graph_t* _graph; - cost_t* _cost; + test_param_t* _bfs_param; + totem_mem_t _mem_type; + graph_t* _graph; + cost_t* _cost; }; // Tests BFS for empty graphs. TEST_P(BFSTest, Empty) { - _graph = (graph_t*)calloc(1, sizeof(graph_t)); + _graph = reinterpret_cast(calloc(1, sizeof(graph_t))); EXPECT_EQ(FAILURE, TestGraph(0)); EXPECT_EQ(FAILURE, TestGraph(99)); free(_graph); @@ -74,8 +72,8 @@ TEST_P(BFSTest, Empty) { // Tests BFS for single node graphs. TEST_P(BFSTest, SingleNode) { graph_initialize(DATA_FOLDER("single_node.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); EXPECT_EQ(SUCCESS, TestGraph(0)); EXPECT_EQ((cost_t)0, _cost[0]); EXPECT_EQ(FAILURE, TestGraph(1)); @@ -83,8 +81,8 @@ TEST_P(BFSTest, SingleNode) { TEST_P(BFSTest, SingleNodeLoop) { graph_initialize(DATA_FOLDER("single_node_loop.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); EXPECT_EQ(SUCCESS, TestGraph(0)); EXPECT_EQ((cost_t)0, _cost[0]); EXPECT_EQ(FAILURE, TestGraph(1)); @@ -94,14 +92,14 @@ TEST_P(BFSTest, SingleNodeLoop) { TEST_P(BFSTest, EmptyEdges) { graph_initialize(DATA_FOLDER("disconnected_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(INF_COST, _cost[vertex]); } @@ -109,14 +107,14 @@ TEST_P(BFSTest, EmptyEdges) { source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source; vertex < _graph->vertex_count - 1; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count - 1; vertex++) { EXPECT_EQ(INF_COST, _cost[vertex]); } // A vertex in the middle as source source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { 
EXPECT_EQ((vertex == source) ? (cost_t)0 : INF_COST, _cost[vertex]); } @@ -127,28 +125,29 @@ TEST_P(BFSTest, EmptyEdges) { // Tests BFS for a chain of 1000 nodes. TEST_P(BFSTest, Chain) { graph_initialize(DATA_FOLDER("chain_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = source; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(vertex, _cost[vertex]); } // Last vertex as source source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = source; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(source - vertex, _cost[vertex]); } // A vertex in the middle as source source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { - EXPECT_EQ((cost_t)abs((double)source - (double)vertex), _cost[vertex]); + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + EXPECT_EQ((cost_t)abs(static_cast(source) - + static_cast(vertex)), _cost[vertex]); } // Non existent vertex source @@ -159,14 +158,14 @@ TEST_P(BFSTest, Chain) { TEST_P(BFSTest, CompleteGraph) { graph_initialize(DATA_FOLDER("complete_graph_300_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } @@ -174,14 +173,14 @@ TEST_P(BFSTest, CompleteGraph) { source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = 0; vertex < source; vertex++) { + for (vid_t vertex = 0; vertex < source; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } // A vertex source in the middle source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)((source == vertex) ? 0 : 1), _cost[vertex]); } @@ -192,14 +191,14 @@ TEST_P(BFSTest, CompleteGraph) { // Tests BFS for a complete graph of 1000 nodes. 
TEST_P(BFSTest, Star) { graph_initialize(DATA_FOLDER("star_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } @@ -208,7 +207,7 @@ TEST_P(BFSTest, Star) { EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); EXPECT_EQ((cost_t)1, _cost[0]); - for(vid_t vertex = 1; vertex < source - 1; vertex++) { + for (vid_t vertex = 1; vertex < source - 1; vertex++) { EXPECT_EQ((cost_t)2, _cost[vertex]); } @@ -216,7 +215,7 @@ TEST_P(BFSTest, Star) { source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)1, _cost[0]); - for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)((source == vertex) ? 0 : 2), _cost[vertex]); } @@ -224,42 +223,31 @@ TEST_P(BFSTest, Star) { EXPECT_EQ(FAILURE, TestGraph(_graph->vertex_count)); } -// TODO(lauro): Add test cases for not well defined structures. - -// Values() seems to accept only pointers, hence the possible parameters -// are defined here, and a pointer to each ot them is used. -bfs_param_t bfs_params[] = { - {NULL, &bfs_cpu}, - {NULL, &bfs_bu_cpu}, - {NULL, &bfs_queue_cpu}, - {NULL, &bfs_gpu}, - {NULL, &bfs_bu_gpu}, - {NULL, &bfs_vwarp_gpu}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL}, - {&totem_attrs[8], NULL}, - {&totem_attrs[9], NULL}, - {&totem_attrs[10], NULL}, - {&totem_attrs[11], NULL}, - {&totem_attrs[12], NULL}, - {&totem_attrs[13], NULL}, - {&totem_attrs[14], NULL}, - {&totem_attrs[15], NULL}, - {&totem_attrs[16], NULL}, - {&totem_attrs[17], NULL}, - {&totem_attrs[18], NULL}, - {&totem_attrs[19], NULL}, - {&totem_attrs[20], NULL}, - {&totem_attrs[21], NULL}, - {&totem_attrs[22], NULL}, - {&totem_attrs[23], NULL} +// Defines the set of BFS vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +void* bfs_vanilla_funcs[] = { + reinterpret_cast(&bfs_cpu), + reinterpret_cast(&bfs_bu_cpu), + reinterpret_cast(&bfs_queue_cpu), + reinterpret_cast(&bfs_gpu), + reinterpret_cast(&bfs_vwarp_gpu), + reinterpret_cast(&bfs_bu_gpu), +}; +const int bfs_vanilla_count = STATIC_ARRAY_COUNT(bfs_vanilla_funcs); + +// Defines the set of BFS hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +void* bfs_hybrid_funcs[] = { + reinterpret_cast(&bfs_hybrid), + reinterpret_cast(&bfs_stepwise_hybrid), }; +const int bfs_hybrid_count = STATIC_ARRAY_COUNT(bfs_hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. 
+static const int bfs_params_count = bfs_vanilla_count + + bfs_hybrid_count * hybrid_configurations_count; +static test_param_t* bfs_params[bfs_params_count]; // From Google documentation: // In order to run value-parameterized tests, we need to instantiate them, @@ -267,37 +255,12 @@ bfs_param_t bfs_params[] = { // // Values() receives a list of parameters and the framework will execute the // whole set of tests BFSTest for each element of Values() -INSTANTIATE_TEST_CASE_P(BFSGPUAndCPUTest, BFSTest, Values(&bfs_params[0], - &bfs_params[1], - &bfs_params[2], - &bfs_params[3], - &bfs_params[4], - &bfs_params[5], - &bfs_params[6], - &bfs_params[7], - &bfs_params[8], - &bfs_params[9], - &bfs_params[10], - &bfs_params[11], - &bfs_params[12], - &bfs_params[13], - &bfs_params[14], - &bfs_params[15], - &bfs_params[16], - &bfs_params[17], - &bfs_params[18], - &bfs_params[19], - &bfs_params[20], - &bfs_params[21], - &bfs_params[22], - &bfs_params[23], - &bfs_params[24], - &bfs_params[25], - &bfs_params[26], - &bfs_params[27], - &bfs_params[28], - &bfs_params[29] )); - +INSTANTIATE_TEST_CASE_P(BFSGPUAndCPUTest, BFSTest, + ValuesIn(GetParameters( + bfs_params, bfs_params_count, + bfs_vanilla_funcs, bfs_vanilla_count, + bfs_hybrid_funcs, bfs_hybrid_count), + bfs_params + bfs_params_count)); #else // From Google documentation: diff --git a/src/test/totem_common_unittest.h b/src/test/totem_common_unittest.h index bf1c823..1d3a39b 100644 --- a/src/test/totem_common_unittest.h +++ b/src/test/totem_common_unittest.h @@ -27,7 +27,7 @@ #define CUDA_CHECK_VERSION() \ do { \ - if(check_cuda_version() != SUCCESS) { \ + if (check_cuda_version() != SUCCESS) { \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -47,7 +47,7 @@ if (!(stmt)) { \ printf("Error line: %d\n", __LINE__); \ } \ - } while(0) + } while (0) // Hybrid algorithms attributes const float CPU_SHARE_ZERO = 0; @@ -58,109 +58,195 @@ const bool VERTEX_IDS_SORTED = true; const bool VERTEX_IDS_NOT_SORTED = false; const int GPU_COUNT_ONE = 1; PRIVATE totem_attr_t totem_attrs[] = { - // (0) CPU only - {PAR_RANDOM, PLATFORM_CPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (1) GPU only - {PAR_RANDOM, PLATFORM_GPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (2) Multi GPU - {PAR_RANDOM, PLATFORM_GPU, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (3) Hybrid CPU + 1 GPU - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (4) Hybrid CPU + 1 GPU - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (5) Hybrid CPU + 1 GPU - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (6) Hybrid CPU + 1 GPU, sorted vertices - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (7) Hybrid CPU + 1 GPU, sorted vertices - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, 
GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (8) Hybrid CPU + 1 GPU, sorted vertices - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (9) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (10) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (11) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (12) Hybrid CPU + all GPU - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (13) Hybrid CPU + all GPU - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (14) Hybrid CPU + all GPU - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (15) Hybrid CPU + all GPU, sorted vertices - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (16) Hybrid CPU + all GPU, sorted vertices - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (17) Hybrid CPU + all GPU, sorted vertices - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (18) Hybrid CPU + all GPU, randomized vertex placement - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (19) Hybrid CPU + all GPU, randomized vertex placement - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (20) Hybrid CPU + all GPU, randomized vertex placement - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (21) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (22) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, 
CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (23) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, + { // (0) CPU only + PAR_RANDOM, PLATFORM_CPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (1) GPU only + PAR_RANDOM, PLATFORM_GPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (2) Multi GPU + PAR_RANDOM, PLATFORM_GPU, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (3) Hybrid CPU + 1 GPU + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (4) Hybrid CPU + 1 GPU + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (5) Hybrid CPU + 1 GPU + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (6) Hybrid CPU + 1 GPU, sorted vertices + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (7) Hybrid CPU + 1 GPU, sorted vertices + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (8) Hybrid CPU + 1 GPU, sorted vertices + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (9) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (10) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (11) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (12) Hybrid CPU + all GPU + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (13) Hybrid CPU + all GPU + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (14) Hybrid CPU + all GPU + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, 
+ + { // (15) Hybrid CPU + all GPU, sorted vertices + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (16) Hybrid CPU + all GPU, sorted vertices + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (17) Hybrid CPU + all GPU, sorted vertices + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (18) Hybrid CPU + all GPU, randomized vertex placement + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (19) Hybrid CPU + all GPU, randomized vertex placement + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (20) Hybrid CPU + all GPU, randomized vertex placement + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (21) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (22) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (23) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, }; -#endif // TOTEM_COMMON_UNITTEST_H +// A macro that computes the number of elements of a static array. +#define STATIC_ARRAY_COUNT(array) sizeof(array) / sizeof(*array); + +// The number of hybrid configurations in the totem_attr array. +static const int hybrid_configurations_count = STATIC_ARRAY_COUNT(totem_attrs); + +// This is to allow testing the vanilla and the hybrid functions that are +// based on the Totem framework. +typedef struct { + totem_attr_t* attr; // Attributes for totem-based implementations. + void* func; // The algorithm function to be tested. +} test_param_t; + +// Adds a test parameter to the passed vector of parameters. +static void PushParam(std::vector* params_vector, + totem_attr_t* attr, void* func) { + test_param_t param; + param.attr = attr; + param.func = func; + params_vector->push_back(param); +} + +// Returns a reference to an array of references to the various test parameters +// to be tested. 
+static test_param_t** GetParameters(test_param_t** params, int params_count,
+                                    void** vanilla_funcs, int vanilla_count,
+                                    void** hybrid_funcs, int hybrid_count) {
+  // When this function is passed as a parameter to "ValuesIn" in the context
+  // of the INSTANTIATE_TEST_CASE_P macro, it gets invoked more than once;
+  // therefore, the following check ensures that the parameters array is
+  // initialized only once.
+  static bool initialized = false;
+  if (initialized) { return params; }
+
+  // This vector maintains the state of the different parameters during the
+  // tests, and hence it is defined static.
+  static std::vector<test_param_t> params_vector;
+
+  // Add the vanilla implementations.
+  for (int i = 0; i < vanilla_count; i++) {
+    PushParam(&params_vector, NULL, vanilla_funcs[i]);
+  }
+
+  // Add the hybrid implementations.
+  for (int i = 0; i < hybrid_count; i++) {
+    // Add the different configurations of the hybrid implementation.
+    for (uint32_t j = 0; j < hybrid_configurations_count; j++) {
+      PushParam(&params_vector, &totem_attrs[j], hybrid_funcs[i]);
+    }
+  }
+
+  // Fill the params array with references to the parameters to be tested
+  // (maintained by params_vector throughout the execution of the tests).
+  assert(static_cast<size_t>(params_count) == params_vector.size());
+  for (size_t i = 0; i != params_vector.size(); i++) {
+    params[i] = &params_vector[i];
+  }
+
+  initialized = true;
+  return params;
+}
+
+#endif  // TOTEM_COMMON_UNITTEST_H
diff --git a/src/test/totem_page_rank_unittest.cu b/src/test/totem_page_rank_unittest.cu
index 441946c..70ae4ff 100644
--- a/src/test/totem_page_rank_unittest.cu
+++ b/src/test/totem_page_rank_unittest.cu
@@ -1,5 +1,6 @@
 /*
- * Contains unit tests for an implementation of the PageRank graph algorithm.
+ * Contains unit tests for the different implementations of the PageRank graph
+ * algorithm.
  *
  * Created on: 2011-03-22
  * Author: Abdullah Gharaibeh
@@ -11,28 +12,26 @@
 #if GTEST_HAS_PARAM_TEST
 
 using ::testing::TestWithParam;
-using ::testing::Values;
+using ::testing::ValuesIn;
+
+// IMPORTANT NOTE: Some of the unit tests for page_rank_incoming_hybrid will
+// fail if the type rank_t is float due to differences in single precision
+// calculations between the CPU and the GPU. Defining rank_t as double should
+// allow the tests to pass.
+
 // The following implementation relies on TestWithParam<T> to
-// test the two versions of PageRank implemented: CPU and GPU.
-// Details on how to use TestWithParam<T> can be found at:
-// totem_bfs_unittest.cc and
+// test the different versions of PageRank. Details on how to use
+// TestWithParam<T> can be found at:
 // http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc
 typedef error_t(*PageRankFunction)(graph_t*, rank_t*, rank_t*);
+typedef error_t(*PageRankHybridFunction)(rank_t*, rank_t*);
 
-// This is to allow testing the vanilla bfs functions and the hybrid one
-// that is based on the framework. Note that have a different signature
-// of the hybrid algorithm forced this work-around.
-typedef struct page_rank_param_s {
-  totem_attr_t* attr;     // totem attributes for totem-based tests
-  PageRankFunction func;  // the vanilla page_rank function if attr is NULL
-} page_rank_param_t;
-
-class PageRankTest : public TestWithParam<page_rank_param_t*> {
+class PageRankTest : public TestWithParam<test_param_t*> {
  public:
   virtual void SetUp() {
-    // Ensure the minimum CUDA architecture is supported
+    // Ensure the minimum CUDA architecture is supported.
     CUDA_CHECK_VERSION();
     _page_rank_param = GetParam();
     _rank = NULL;
@@ -40,39 +39,46 @@ class PageRankTest : public TestWithParam<page_rank_param_t*> {
   }
 
   virtual void TearDown() {
-    if (_graph) graph_finalize(_graph);
-    if (_rank) totem_free(_rank, TOTEM_MEM_HOST_PINNED);
+    if (_graph) { graph_finalize(_graph); }
+    if (_rank) { totem_free(_rank, TOTEM_MEM_HOST_PINNED); }
   }
 
   error_t TestGraph() {
-    // the graph should be undirected because the test is shared between the
+    // The graph should be undirected because the test is shared between the
     // two versions of the PageRank algorithm: incoming- and outgoing- based.
     EXPECT_FALSE(_graph->directed);
     if (_graph->vertex_count != 0) {
-      CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(rank_t),
-                             TOTEM_MEM_HOST_PINNED, (void**)&_rank));
+      CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(rank_t),
+                             TOTEM_MEM_HOST_PINNED,
+                             reinterpret_cast<void**>(&_rank)));
     }
     if (_page_rank_param->attr != NULL) {
       _page_rank_param->attr->pull_msg_size = sizeof(rank_t) * BITS_PER_BYTE;
+      _page_rank_param->attr->push_msg_size = sizeof(rank_t) * BITS_PER_BYTE;
       if (totem_init(_graph, _page_rank_param->attr) == FAILURE) {
         return FAILURE;
      }
-      error_t err = page_rank_incoming_hybrid(NULL, _rank);
+      PageRankHybridFunction func =
+          reinterpret_cast<PageRankHybridFunction>(_page_rank_param->func);
+      error_t err = func(NULL, _rank);
       totem_finalize();
       return err;
+    } else {
+      PageRankFunction func =
+          reinterpret_cast<PageRankFunction>(_page_rank_param->func);
+      return func(_graph, NULL, _rank);
     }
-    return _page_rank_param->func(_graph, NULL, _rank);
   }
 
  protected:
-  page_rank_param_t* _page_rank_param;
+  test_param_t* _page_rank_param;
   rank_t* _rank;
   graph_t* _graph;
 };
 
 // Tests PageRank for empty graphs.
 TEST_P(PageRankTest, Empty) {
-  _graph = (graph_t*)calloc(sizeof(graph_t), 1);
+  _graph = reinterpret_cast<graph_t*>(calloc(sizeof(graph_t), 1));
   EXPECT_EQ(FAILURE, TestGraph());
   free(_graph);
   _graph = NULL;
@@ -92,7 +98,7 @@ TEST_P(PageRankTest, Chain) {
   EXPECT_EQ(SUCCESS, graph_initialize(DATA_FOLDER("chain_1000_nodes.totem"),
                                       false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 0; vertex < _graph->vertex_count/2; vertex++){
+  for (vid_t vertex = 0; vertex < _graph->vertex_count / 2; vertex++) {
     EXPECT_FLOAT_EQ(_rank[vertex], _rank[_graph->vertex_count - vertex - 1]);
   }
 }
@@ -103,7 +109,7 @@ TEST_P(PageRankTest, CompleteGraph) {
             graph_initialize(DATA_FOLDER("complete_graph_300_nodes.totem"),
                              false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++){
+  for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) {
     EXPECT_FLOAT_EQ(_rank[0], _rank[vertex]);
   }
 }
@@ -114,53 +120,48 @@ TEST_P(PageRankTest, Star) {
             graph_initialize(DATA_FOLDER("star_1000_nodes.totem"),
                              false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++){
+  for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) {
     EXPECT_FLOAT_EQ(_rank[1], _rank[vertex]);
     EXPECT_GT(_rank[0], _rank[vertex]);
   }
 }
 
-// TODO(abdullah): Add test cases for not well defined structures.
-// TODO(abdullah,lauro): Add test cases for non-empty vertex set and empty edge
-// set.
-
-// Values() seems to accept only pointers, hence the possible parameters
-// are defined here, and a pointer to each ot them is used.
-page_rank_param_t page_rank_params[] = { - {NULL, &page_rank_cpu}, - {NULL, &page_rank_gpu}, - {NULL, &page_rank_vwarp_gpu}, - {NULL, &page_rank_incoming_cpu}, - {NULL, &page_rank_incoming_gpu}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL} +// Defines the set of PageRank vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* vanilla_funcs[] = { + reinterpret_cast(&page_rank_cpu), + reinterpret_cast(&page_rank_incoming_cpu), + reinterpret_cast(&page_rank_gpu), + reinterpret_cast(&page_rank_vwarp_gpu), + reinterpret_cast(&page_rank_incoming_gpu), }; +static const int vanilla_count = STATIC_ARRAY_COUNT(vanilla_funcs); -// Values() receives a list of parameters and the framework will execute the -// whole set of tests PageRankTest for each element of Values() -// TODO(abdullah): both versions of the PageRank algorithm (the incoming- and -// outgoing- based) can share the same tests because all the graphs are -// undirected. Separate the two for cases where the graphs are directed. +// Defines the set of PageRank hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* hybrid_funcs[] = { + reinterpret_cast(&page_rank_hybrid), + reinterpret_cast(&page_rank_incoming_hybrid), +}; +static const int hybrid_count = STATIC_ARRAY_COUNT(hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. +static const int params_count = vanilla_count + + hybrid_count * hybrid_configurations_count; +static test_param_t* params[params_count]; + +// From Google documentation: +// In order to run value-parameterized tests, we need to instantiate them, +// or bind them to a list of values which will be used as test parameters. +// +// ValuesIn() receives a list of parameters and the framework will execute the +// whole set of tests for each entry in the array passed to ValuesIn(). INSTANTIATE_TEST_CASE_P(PageRankGPUAndCPUTest, PageRankTest, - Values(&page_rank_params[0], - &page_rank_params[1], - &page_rank_params[2], - &page_rank_params[3], - &page_rank_params[4], - &page_rank_params[5], - &page_rank_params[6], - &page_rank_params[7], - &page_rank_params[8], - &page_rank_params[9], - &page_rank_params[10], - &page_rank_params[11], - &page_rank_params[12])); + ValuesIn(GetParameters(params, params_count, + vanilla_funcs, vanilla_count, + hybrid_funcs, hybrid_count), + params + params_count)); #else diff --git a/src/thirdparty/wattsup/makefile b/src/thirdparty/wattsup/makefile new file mode 100644 index 0000000..f9541e8 --- /dev/null +++ b/src/thirdparty/wattsup/makefile @@ -0,0 +1,17 @@ +#-------------------------------------------------------------------------- +# Wattsup makefile. +# +# Created on: 2014-08-28 +# Author: Abdullah Gharaibeh +#-------------------------------------------------------------------------- +TARGET = wattsup +ROOTDIR = ../.. +include $(ROOTDIR)/make.defs + +all: $(BINDIR)/$(TARGET) + +$(BINDIR)/$(TARGET): $(BINDIR) objects + @printf "\nLinking %s executable ...\n" $(BINDIR)/$(TARGET) + $(NVCC) -o $@ $(COBJS) $(CCOBJS) $(CUOBJS) $(LIBS) $(LFLAGS) + +include $(ROOTDIR)/make.rules
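Note on the test-parameter plumbing introduced in this patch: GetParameters() flattens two inputs into a single gtest parameter list; every vanilla function becomes one test_param_t with a NULL attribute, and every hybrid function is paired with each entry of totem_attrs. The standalone sketch below illustrates the same pattern using only standard C++. All names in it (DummyAttr, DummyParam, BuildParams, configs) are hypothetical stand-ins, not Totem or gtest symbols.

// Standalone sketch of the parameter-flattening pattern (hypothetical names;
// standard C++ only). Every "vanilla" function becomes one parameter with a
// NULL attribute; every "hybrid" function is paired with every configuration.
#include <cassert>
#include <cstdio>
#include <vector>

struct DummyAttr { int id; };   // stand-in for a configuration (totem_attr_t)
struct DummyParam {             // stand-in for test_param_t
  DummyAttr* attr;              // NULL for vanilla implementations
  void* func;                   // the implementation under test
};

static DummyAttr configs[] = { {0}, {1}, {2} };
static const int kConfigCount = sizeof(configs) / sizeof(*configs);

static void PushParam(std::vector<DummyParam>* v, DummyAttr* attr, void* func) {
  DummyParam p;
  p.attr = attr;
  p.func = func;
  v->push_back(p);
}

// Flattens the vanilla functions and the (hybrid function x configuration)
// pairs into one array of stable pointers, as GetParameters() does above.
static DummyParam** BuildParams(DummyParam** out, int out_count,
                                void** vanilla, int vanilla_count,
                                void** hybrid, int hybrid_count) {
  // static: the parameter objects must outlive this call, because only
  // pointers to them are handed to the test framework.
  static std::vector<DummyParam> storage;
  for (int i = 0; i < vanilla_count; i++) {
    PushParam(&storage, NULL, vanilla[i]);
  }
  for (int i = 0; i < hybrid_count; i++) {
    for (int j = 0; j < kConfigCount; j++) {
      PushParam(&storage, &configs[j], hybrid[i]);
    }
  }
  assert(static_cast<size_t>(out_count) == storage.size());
  for (size_t i = 0; i < storage.size(); i++) { out[i] = &storage[i]; }
  return out;
}

static void vanilla_a() {}
static void hybrid_b() {}

int main() {
  // Function pointers are stored as void*, mirroring the casts in the patch.
  void* vanilla[] = { reinterpret_cast<void*>(&vanilla_a) };
  void* hybrid[]  = { reinterpret_cast<void*>(&hybrid_b) };
  const int count = 1 + 1 * kConfigCount;  // 1 vanilla + 1 hybrid x 3 configs
  DummyParam* params[count];
  BuildParams(params, count, vanilla, 1, hybrid, 1);
  printf("generated %d test parameters\n", count);
  return 0;
}

If BuildParams() could be invoked more than once, a guard such as the static initialized flag used by GetParameters() in the patch would be needed to avoid duplicating the entries.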
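A small robustness suggestion (not part of the patch) regarding the STATIC_ARRAY_COUNT helper added to totem_common_unittest.h: as defined, it ends with a semicolon and does not parenthesize its expansion. That happens to compile in the simple constant initializers used here, but it becomes fragile as soon as the macro appears inside a larger expression. A more defensive form, shown purely as an assumption about how one might harden it, would be:

// Suggested hardening (not in the patch): drop the trailing semicolon and
// parenthesize the expansion so the macro composes inside larger expressions,
// e.g. const int total = STATIC_ARRAY_COUNT(funcs) * config_count;
#define STATIC_ARRAY_COUNT(array) (sizeof(array) / sizeof(*(array)))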