diff --git a/src/alg/totem_alg.h b/src/alg/totem_alg.h index f013822..49ccf4e 100644 --- a/src/alg/totem_alg.h +++ b/src/alg/totem_alg.h @@ -108,6 +108,7 @@ error_t bfs_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_bu_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_vwarp_gpu(graph_t* graph, vid_t src_id, cost_t* cost); error_t bfs_hybrid(vid_t src_id, cost_t* cost); +error_t bfs_stepwise_hybrid(vid_t src_id, cost_t* cost); /** * Given an undirected, unweighted graph and a source vertex, compute the diff --git a/src/alg/totem_betweenness_hybrid.cu b/src/alg/totem_betweenness_hybrid.cu index 1147b7f..991a0bd 100644 --- a/src/alg/totem_betweenness_hybrid.cu +++ b/src/alg/totem_betweenness_hybrid.cu @@ -7,62 +7,58 @@ * Robert Woff */ -// Totem includes +// totem includes #include "totem_alg.h" #include "totem_centrality.h" #include "totem_engine.cuh" #include -/** - * Per-partition specific state - */ + +// Per-partition specific state. typedef struct betweenness_state_s { - cost_t* distance[MAX_PARTITION_COUNT]; // a list of distances state, one per - // partition - uint32_t* numSPs[MAX_PARTITION_COUNT]; // a list of number of shortest paths - // state, one per partition - uint32_t* numSPs_f[MAX_PARTITION_COUNT]; // a list of number of shortest paths - // state, one per partition - score_t* delta[MAX_PARTITION_COUNT]; // delta BC score for a vertex - bool* done; // pointer to global finish flag - cost_t level; // current level being processed by the partition - score_t* betweenness; // betweenness score + cost_t* distance[MAX_PARTITION_COUNT]; // a list of distances state, one + // per partition + uint32_t* numSPs[MAX_PARTITION_COUNT]; // a list of number of shortest + // paths state, one per partition + uint32_t* numSPs_f[MAX_PARTITION_COUNT]; // a list of number of shortest + // paths state, one per partition + score_t* delta[MAX_PARTITION_COUNT]; // delta BC score for a vertex + bool* done; // pointer to global finish flag + cost_t level; // current level being processed by the partition + score_t* betweenness; // betweenness score frontier_state_t frontier; - bool* comm; // flags that indicates whether to instruct the engine + bool* comm; // flags that indicates whether to instruct the engine // to perform communication or not. This array is // populated during the forward phase, and used during // the backward propagation phase } betweenness_state_t; -/** - * State shared between all partitions - */ +// State shared between all partitions. typedef struct betweenness_global_state_s { - score_t* betweenness_score; // final output buffer - score_t* betweenness_score_h; // used as a temporary buffer - vid_t src; // source vertex id (id after partitioning) - double epsilon; // determines accuracy of BC computation - int num_samples; // number of samples for approximate BC + score_t* betweenness_score; // final output buffer + score_t* betweenness_score_h; // used as a temporary buffer + vid_t src; // source vertex id (id after partitioning) + double epsilon; // determines accuracy of BC computation + int num_samples; // number of samples for approximate BC } betweenness_global_state_t; PRIVATE betweenness_global_state_t bc_g; -/** - * The neighbors forward propagation processing function. This function sets - * the level of the neighbors' vertex to one level more than the parent vertex. - * The assumption is that the threads of a warp invoke this function to process - * the warp's batch of work. 
In each iteration of the for loop, each thread - * processes a neighbor. For example, thread 0 in the warp processes neighbors - * at indices 0, VWARP_WIDTH, (2 * VWARP_WIDTH) etc. in the edges array, while - * thread 1 in the warp processes neighbors 1, (1 + VWARP_WIDTH), - * (1 + 2 * VWARP_WIDTH) and so on. - */ + +// The neighbors forward propagation processing function. This function sets +// the level of the neighbors' vertex to one level more than the parent vertex. +// The assumption is that the threads of a warp invoke this function to process +// the warp's batch of work. In each iteration of the for loop, each thread +// processes a neighbor. For example, thread 0 in the warp processes neighbors +// at indices 0, VWARP_WIDTH, (2 * VWARP_WIDTH) etc. in the edges array, while +// thread 1 in the warp processes neighbors 1, (1 + VWARP_WIDTH), +// (1 + 2 * VWARP_WIDTH) and so on. template __device__ inline void -forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, - const vid_t nbr_count, uint32_t v_numSPs, +forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, + const vid_t nbr_count, uint32_t v_numSPs, betweenness_state_t* state, bool& done_d) { - // Iterate through the portion of work - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { + // Iterate through the portion of work. + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { vid_t nbr = GET_VERTEX_ID(nbrs[i]); int nbr_pid = GET_PARTITION_ID(nbrs[i]); cost_t* nbr_distance = state->distance[nbr_pid]; @@ -79,10 +75,10 @@ forward_process_neighbors(int warp_offset, const vid_t* __restrict nbrs, template __global__ void -forward_kernel(partition_t par, betweenness_state_t state, +forward_kernel(partition_t par, betweenness_state_t state, const vid_t* __restrict frontier, vid_t count) { - if (THREAD_GLOBAL_INDEX >= - vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) return; + if (THREAD_GLOBAL_INDEX >= + vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) { return; } const eid_t* __restrict vertices = par.subgraph.vertices; const uint32_t* __restrict numSPs = state.numSPs[par.id]; @@ -90,50 +86,50 @@ forward_kernel(partition_t par, betweenness_state_t state, // This flag is used to report the finish state of a block of threads. This // is useful to avoid having many threads writing to the global finished // flag, which can hurt performance (since "finished" is actually allocated - // on the host, and each write will cause a transfer over the PCI-E bus) + // on the host, and each write will cause a transfer over the PCI-E bus). __shared__ bool finished_block; finished_block = true; __syncthreads(); - vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + + vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + vwarp_warp_start_vertex(VWARP_WIDTH, VWARP_BATCH); vid_t end_vertex = start_vertex + vwarp_warp_batch_size(count, VWARP_WIDTH, VWARP_BATCH); int warp_offset = vwarp_thread_index(VWARP_WIDTH); - - // Iterate over my work - for(vid_t i = start_vertex; i < end_vertex; i++) { + + // Iterate over my work. + for (vid_t i = start_vertex; i < end_vertex; i++) { vid_t v = frontier[i]; // If the distance for this node is equal to the current level, then // forward process its neighbours to determine its contribution to - // the number of shortest paths + // the number of shortest paths. 
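    // (Brandes' forward rule, which forward_process_neighbors applies per
    // warp below: a neighbour w of v whose distance is still INF_COST is
    // pulled into level + 1, and while distance[w] == level + 1 its shortest
    // path count is accumulated as numSPs[w] += numSPs[v]. The CPU path does
    // this accumulation with __sync_fetch_and_add; the GPU path is assumed
    // to use an equivalent atomic add, since several warps may hit the same
    // neighbour.)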
const eid_t nbr_count = vertices[v + 1] - vertices[v]; vid_t* nbrs = par.subgraph.edges + vertices[v]; if (v >= par.subgraph.vertex_ext) { - nbrs = par.subgraph.edges_ext + + nbrs = par.subgraph.edges_ext + (vertices[v] - par.subgraph.edge_count_ext); } forward_process_neighbors - (warp_offset, nbrs, nbr_count, numSPs[v], &state, finished_block); + (warp_offset, nbrs, nbr_count, numSPs[v], &state, finished_block); } __syncthreads(); - // If there is remaining work to do, set the done flag to false + // If there is remaining work to do, set the done flag to false. if (!finished_block && THREAD_BLOCK_INDEX == 0) *(state.done) = false; } -typedef void(*bc_gpu_func_t)(partition_t*, betweenness_state_t*, vid_t*, vid_t, +typedef void(*bc_gpu_func_t)(partition_t*, betweenness_state_t*, vid_t*, vid_t, cudaStream_t); template #ifdef FEATURE_SM35 -PRIVATE __host__ __device__ +PRIVATE __host__ __device__ #else PRIVATE __host__ #endif /* FEATURE_SM35 */ void forward_launch_gpu(partition_t* par, betweenness_state_t* state, vid_t* frontier, vid_t count, cudaStream_t stream) { - if (count == 0) return; - dim3 blocks; + if (count == 0) { return; } + dim3 blocks; const int threads = MAX_THREADS_PER_BLOCK; kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), blocks, threads); @@ -143,7 +139,7 @@ void forward_launch_gpu(partition_t* par, betweenness_state_t* state, #ifdef FEATURE_SM35 PRIVATE __global__ -void forward_launch_at_boundary_kernel(partition_t par, +void forward_launch_at_boundary_kernel(partition_t par, betweenness_state_t state) { if (THREAD_GLOBAL_INDEX > 0 || (*state.frontier.count == 0)) { return; @@ -181,12 +177,11 @@ PRIVATE const bc_gpu_func_t FORWARD_GPU_FUNC[] = { forward_launch_gpu }; -/** - * Entry point for forward propagation on the GPU - */ +// Entry point for forward propagation on the GPU. PRIVATE inline void betweenness_forward_gpu(partition_t* par) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - frontier_update_list_gpu(&state->frontier, state->level, + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + frontier_update_list_gpu(&state->frontier, state->level, state->distance[par->id], par->streams[1]); if (engine_partition_algorithm() == PAR_SORTED_DSC) { @@ -202,16 +197,16 @@ PRIVATE inline void betweenness_forward_gpu(partition_t* par) { } } - // clear out the outbox buffers + // Clear out the outbox buffers. for (int rmt_pid = 0; rmt_pid < context.pset->partition_count; rmt_pid++) { grooves_box_table_t* outbox = &par->outbox[rmt_pid]; - if (rmt_pid == par->id || !outbox->count) continue; - cudaMemsetAsync(outbox->push_values, 0, outbox->count * sizeof(uint32_t), + if (rmt_pid == par->id || !outbox->count) { continue; } + cudaMemsetAsync(outbox->push_values, 0, outbox->count * sizeof(uint32_t), par->streams[1]); } - // If the vertices are sorted by degree, call a kernel that takes - // advantage of that + // If the vertices are sorted by degree, call a kernel that takes + // advantage of that. #ifdef FEATURE_SM35 if (engine_sorted()) { frontier_update_boundaries_gpu(&state->frontier, &par->subgraph, @@ -224,34 +219,32 @@ PRIVATE inline void betweenness_forward_gpu(partition_t* par) { #endif /* FEATURE_SM35 */ // Call the corresponding cuda kernel to perform forward propagation - // given the current state of the algorithm + // given the current state of the algorithm. 
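  // (The frontier length was produced asynchronously on streams[1] by
  // frontier_update_list_gpu above, so it has to be copied back to the host
  // and the stream synchronized before it can be used to size the kernel
  // launch below.)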
vid_t count; - CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, - sizeof(vid_t), cudaMemcpyDefault, - par->streams[1])); + CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, sizeof(vid_t), + cudaMemcpyDefault, par->streams[1])); CALL_CU_SAFE(cudaStreamSynchronize(par->streams[1])); int par_alg = engine_partition_algorithm(); - FORWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, + FORWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, par->streams[1]); } -/** - * Entry point for forward propagation on the CPU - */ +// Entry point for forward propagation on the CPU void betweenness_forward_cpu(partition_t* par) { // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - graph_t* subgraph = &par->subgraph; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + graph_t* subgraph = &par->subgraph; cost_t* distance = state->distance[par->id]; uint32_t* numSPs = state->numSPs[par->id]; bool done = true; bool comm = false; - // In parallel, iterate over vertices which are at the current level - OMP(omp parallel for schedule(runtime) reduction(& : done) + // In parallel, iterate over vertices which are at the current level. + OMP(omp parallel for schedule(runtime) reduction(& : done) reduction(| : comm)) for (vid_t v = 0; v < subgraph->vertex_count; v++) { if (distance[v] == state->level) { - for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; + for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; e++) { vid_t nbr = GET_VERTEX_ID(subgraph->edges[e]); int nbr_pid = GET_PARTITION_ID(subgraph->edges[e]); @@ -263,7 +256,7 @@ void betweenness_forward_cpu(partition_t* par) { } if (nbr_distance[nbr] == state->level + 1) { uint32_t* nbr_numSPs = state->numSPs_f[nbr_pid]; - __sync_fetch_and_add(&nbr_numSPs[nbr], numSPs[v]); + __sync_fetch_and_add(&nbr_numSPs[nbr], numSPs[v]); } } } @@ -273,30 +266,30 @@ void betweenness_forward_cpu(partition_t* par) { state->comm[state->level] = false; } - // If there is remaining work to do, set the done flag to false - if (!done) { + // If there is remaining work to do, set the done flag to false. + if (!done) { *(state->done) = false; } } -/** - * Distributes work to either the CPU or GPU - */ +// Distributes work to either the CPU or GPU. PRIVATE void betweenness_forward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); for (int pid = 0; pid < engine_partition_count(); pid++) { if (pid != par->id) { - state->numSPs_f[pid] = (uint32_t*)par->outbox[pid].push_values; + state->numSPs_f[pid] = + reinterpret_cast(par->outbox[pid].push_values); } } // Check which kind of processor this partition corresponds to and - // call the appropriate function to perform forward propagation + // call the appropriate function to perform forward propagation. 
if (par->processor.type == PROCESSOR_CPU) { betweenness_forward_cpu(par); } else if (par->processor.type == PROCESSOR_GPU) { @@ -304,14 +297,12 @@ PRIVATE void betweenness_forward(partition_t* par) { } else { assert(false); } - // Increment the level for the next round of forward propagation + // Increment the level for the next round of forward propagation. state->level++; } -/** - * The neighbors backward propagation processing function. This function - * computes the delta of a vertex. - */ +// The neighbors backward propagation processing function. This function +// computes the delta of a vertex. template __device__ void backward_process_neighbors(partition_t* par, betweenness_state_t* state, @@ -319,13 +310,13 @@ backward_process_neighbors(partition_t* par, betweenness_state_t* state, uint32_t v_numSPs, score_t* vwarp_delta_s, vid_t v) { int warp_offset = vwarp_thread_index(VWARP_WIDTH); score_t sum = 0; - // Iterate through the portion of work - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { + // Iterate through the portion of work. + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_WIDTH) { vid_t nbr = GET_VERTEX_ID(nbrs[i]); int nbr_pid = GET_PARTITION_ID(nbrs[i]); cost_t* nbr_distance = state->distance[nbr_pid]; if (nbr_distance[nbr] == state->level + 1) { - // Compute an intermediary delta value in shared memory + // Compute an intermediary delta value in shared memory. score_t* nbr_delta = state->delta[nbr_pid]; uint32_t* nbr_numSPs = state->numSPs[nbr_pid]; sum += ((((score_t)v_numSPs) / ((score_t)nbr_numSPs[nbr])) * @@ -347,40 +338,38 @@ backward_process_neighbors(partition_t* par, betweenness_state_t* state, } } -/** - * CUDA kernel which performs backward propagation - */ +// CUDA kernel which performs backward propagation. template __global__ void betweenness_backward_kernel(partition_t par, betweenness_state_t state, const vid_t* __restrict frontier, vid_t count) { - if (THREAD_GLOBAL_INDEX >= - vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) return; + if (THREAD_GLOBAL_INDEX >= + vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH)) { return; } const eid_t* __restrict vertices = par.subgraph.vertices; const uint32_t* __restrict numSPs = state.numSPs[par.id]; // Each thread in every warp has an entry in the following array which will be - // used to calculate intermediary delta values in shared memory + // used to calculate intermediary delta values in shared memory. __shared__ score_t delta_s[MAX_THREADS_PER_BLOCK]; const int index = THREAD_BLOCK_INDEX / VWARP_WIDTH; score_t* vwarp_delta_s = &delta_s[index * VWARP_WIDTH]; - vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + + vid_t start_vertex = vwarp_block_start_vertex(VWARP_WIDTH, VWARP_BATCH) + vwarp_warp_start_vertex(VWARP_WIDTH, VWARP_BATCH); vid_t end_vertex = start_vertex + vwarp_warp_batch_size(count, VWARP_WIDTH, VWARP_BATCH); int warp_offset = vwarp_thread_index(VWARP_WIDTH); - - // Iterate over my work - for(vid_t i = start_vertex; i < end_vertex; i++) { + + // Iterate over my work. + for (vid_t i = start_vertex; i < end_vertex; i++) { vid_t v = frontier[i]; // If the vertex is at the current level, determine its contribution - // to the source vertex's delta value + // to the source vertex's delta value. 
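    // (Brandes' dependency accumulation: for every successor w of v, i.e.
    // every neighbour with distance[w] == distance[v] + 1,
    //     delta[v] += (numSPs[v] / numSPs[w]) * (1 + delta[w]),
    // which is the per-warp sum computed by backward_process_neighbors
    // below; the accumulated delta is what finally feeds the betweenness
    // score of v.)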
const eid_t nbr_count = vertices[v + 1] - vertices[v]; vid_t* nbrs = par.subgraph.edges + vertices[v]; if (v >= par.subgraph.vertex_ext) { - nbrs = par.subgraph.edges_ext + + nbrs = par.subgraph.edges_ext + (vertices[v] - par.subgraph.edge_count_ext); } backward_process_neighbors @@ -390,15 +379,15 @@ betweenness_backward_kernel(partition_t par, betweenness_state_t state, template #ifdef FEATURE_SM35 -PRIVATE __host__ __device__ +PRIVATE __host__ __device__ #else PRIVATE __host__ -#endif /* FEATURE_SM35 */ +#endif /* FEATURE_SM35 */ void backward_launch_gpu(partition_t* par, betweenness_state_t* state, vid_t* frontier, vid_t count, cudaStream_t stream) { - if (count == 0) return; + if (count == 0) { return; } dim3 blocks; const int threads = MAX_THREADS_PER_BLOCK; - kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), + kernel_configure(vwarp_thread_count(count, VWARP_WIDTH, VWARP_BATCH), blocks, threads); betweenness_backward_kernel <<>>(*par, *state, frontier, count); @@ -406,7 +395,7 @@ void backward_launch_gpu(partition_t* par, betweenness_state_t* state, #ifdef FEATURE_SM35 PRIVATE __global__ -void backward_launch_at_boundary_kernel(partition_t par, +void backward_launch_at_boundary_kernel(partition_t par, betweenness_state_t state) { if (THREAD_GLOBAL_INDEX > 0 || (*state.frontier.count == 0)) { return; @@ -433,7 +422,7 @@ void backward_launch_at_boundary_kernel(partition_t par, } } } -#endif /* FEATURE_SM35 */ +#endif /* FEATURE_SM35 */ PRIVATE const bc_gpu_func_t BACKWARD_GPU_FUNC[] = { // RANDOM algorithm @@ -444,14 +433,13 @@ PRIVATE const bc_gpu_func_t BACKWARD_GPU_FUNC[] = { backward_launch_gpu }; -/** - * Entry point for backward propagation on GPU - */ +// Entry point for backward propagation on GPU. PRIVATE inline void betweenness_backward_gpu(partition_t* par) { // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); - // If the vertices are sorted by degree, call a kernel that takes + // If the vertices are sorted by degree, call a kernel that takes // advantage of that #ifdef FEATURE_SM35 if (engine_sorted()) { @@ -465,32 +453,31 @@ PRIVATE inline void betweenness_backward_gpu(partition_t* par) { #endif /* FEATURE_SM35 */ vid_t count; - CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, - sizeof(vid_t), cudaMemcpyDefault, + CALL_CU_SAFE(cudaMemcpyAsync(&count, state->frontier.count, + sizeof(vid_t), cudaMemcpyDefault, par->streams[1])); CALL_CU_SAFE(cudaStreamSynchronize(par->streams[1])); int par_alg = engine_partition_algorithm(); - BACKWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, + BACKWARD_GPU_FUNC[par_alg](par, state, state->frontier.list, count, par->streams[1]); } -/** - * Entry point for backward propagation on CPU - */ +// Entry point for backward propagation on CPU. void betweenness_backward_cpu(partition_t* par) { - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; cost_t* distance = state->distance[par->id]; uint32_t* numSPs = state->numSPs[par->id]; score_t* delta = state->delta[par->id]; - // In parallel, iterate over vertices which are at the current level + // In parallel, iterate over vertices which are at the current level. 
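  // (No atomics are needed in this loop: each iteration writes only delta[v]
  // and betweenness[v] of its own vertex, and the neighbour deltas it reads
  // belong to level + 1, which are no longer being written at this point of
  // the backward sweep.)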
OMP(omp parallel for schedule(runtime)) for (vid_t v = 0; v < subgraph->vertex_count; v++) { cost_t v_distance = distance[v]; if (v_distance == state->level) { - // For all neighbors of v, iterate over paths + // For all neighbors of v, iterate over paths. score_t delta_v = 0; for (eid_t e = subgraph->vertices[v]; e < subgraph->vertices[v + 1]; e++) { @@ -498,7 +485,8 @@ void betweenness_backward_cpu(partition_t* par) { int nbr_pid = GET_PARTITION_ID(subgraph->edges[e]); cost_t* nbr_distance = state->distance[nbr_pid]; - // Check whether the neighbour is local or remote and update accordingly + // Check whether the neighbour is local or remote and update + // accordingly. if (nbr_distance[nbr] == state->level + 1) { score_t* nbr_delta = state->delta[nbr_pid]; uint32_t* nbr_numSPs = state->numSPs[nbr_pid]; @@ -506,32 +494,32 @@ void betweenness_backward_cpu(partition_t* par) { (nbr_delta[nbr] + 1)); } } - // Add the dependency to the BC sum + // Add the dependency to the BC sum. delta[v] += delta_v; state->betweenness[v] += delta[v]; } } } -/** - * Distributes work for backward propagation to either the CPU or GPU - */ +// Distributes work for backward propagation to either the CPU or GPU. PRIVATE void betweenness_backward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count || state->level == 0) { return; } for (int pid = 0; pid < engine_partition_count(); pid++) { if (pid != par->id) { - state->delta[pid] = (score_t*)par->outbox[pid].pull_values; + state->delta[pid] = + reinterpret_cast(par->outbox[pid].pull_values); } } if (engine_superstep() > 1) { // Check what kind of processing unit corresponds to this partition and - // then call the appropriate function to perform backward propagation + // then call the appropriate function to perform backward propagation. if (par->processor.type == PROCESSOR_CPU) { betweenness_backward_cpu(par); } else if (par->processor.type == PROCESSOR_GPU) { @@ -540,10 +528,11 @@ PRIVATE void betweenness_backward(partition_t* par) { assert(false); } } - // Decrement the level for the next round of backward propagation + + // Decrement the level for the next round of backward propagation. state->level--; - // Check whether backward propagation is finished + // Check whether backward propagation is finished. if (state->level > 0) { engine_report_not_finished(); if (!state->comm[state->level]) { @@ -552,26 +541,24 @@ PRIVATE void betweenness_backward(partition_t* par) { } } -/* - * Parallel CPU implementation of betweenness scatter function - */ +// Parallel CPU implementation of betweenness scatter function. PRIVATE inline void betweenness_scatter_cpu(int pid, grooves_box_table_t* inbox, betweenness_state_t* state) { cost_t* distance = state->distance[pid]; uint32_t* numSPs = state->numSPs[pid]; - // Get the values that have been pushed to this vertex - uint32_t* inbox_values = (uint32_t*)inbox->push_values; + // Get the values that have been pushed to this vertex. 
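  // (Push data flow: during the forward phase each partition adds its
  // contributions for remote vertices into the corresponding outbox
  // (numSPs_f is aliased onto the outbox push_values buffer), the engine
  // then delivers each outbox into the matching inbox between supersteps,
  // and this scatter folds the delivered counts into the local
  // distance/numSPs state.)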
+ uint32_t* inbox_values = reinterpret_cast(inbox->push_values); OMP(omp parallel for schedule(runtime)) for (vid_t index = 0; index < inbox->count; index++) { if (inbox_values[index] != 0) { vid_t vid = inbox->rmt_nbrs[index]; // If the distance was previously infinity, initialize it to the - // current level + // current level. if (distance[vid] == INF_COST) { distance[vid] = state->level; } - // If the distance is equal to the current level, update the - // nodes number of shortest paths with the pushed value + // If the distance is equal to the current level, update the nodes + // number of shortest paths with the pushed value. if (distance[vid] == state->level) { numSPs[vid] += inbox_values[index]; } @@ -579,65 +566,60 @@ PRIVATE inline void betweenness_scatter_cpu(int pid, grooves_box_table_t* inbox, } } -/* - * Kernel for betweenness_scatter_gpu - */ -__global__ void betweenness_scatter_kernel(grooves_box_table_t inbox, +__global__ void betweenness_scatter_kernel(grooves_box_table_t inbox, cost_t* distance, uint32_t* numSPs, cost_t level) { vid_t index = THREAD_GLOBAL_INDEX; - if (index >= inbox.count) return; + if (index >= inbox.count) { return; } - // Get the values that have been pushed to this vertex - uint32_t* inbox_values = (uint32_t*)inbox.push_values; + // Get the values that have been pushed to this vertex. + uint32_t* inbox_values = reinterpret_cast(inbox.push_values); if (inbox_values[index] != 0) { vid_t vid = inbox.rmt_nbrs[index]; // If the distance was previously infinity, initialize it to the - // current level + // current level. if (distance[vid] == INF_COST) { distance[vid] = level; } - // If the distance is equal to the current level, update the - // nodes number of shortest paths with the pushed value + // If the distance is equal to the current level, update the + // nodes number of shortest paths with the pushed value. if (distance[vid] == level) { numSPs[vid] += inbox_values[index]; } } } -/* - * Parallel GPU implementation of betweenness scatter function - */ -PRIVATE inline void betweenness_scatter_gpu(partition_t* par, +// Parallel GPU implementation of betweenness scatter function. +PRIVATE inline void betweenness_scatter_gpu(partition_t* par, grooves_box_table_t* inbox, betweenness_state_t* state) { dim3 blocks, threads; KERNEL_CONFIGURE(inbox->count, blocks, threads); - // Invoke the appropriate CUDA kernel to perform the scatter functionality + // Invoke the appropriate CUDA kernel to perform the scatter functionality. betweenness_scatter_kernel<<streams[1]>>> (*inbox, state->distance[par->id], state->numSPs[par->id], state->level); CALL_CU_SAFE(cudaGetLastError()); } -/** - * Update the number of shortest paths from remote vertices - * Also update distance if it has yet to be initialized - */ + +// Update the number of shortest paths from remote vertices +// Also update distance if it has yet to be initialized. PRIVATE void betweenness_scatter_forward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. 
+ betweenness_state_t* state = + reinterpret_cast(par->algo_state); for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (rmt_pid == par->id) continue; - // For all remote partitions, get the corresponding inbox + if (rmt_pid == par->id) { continue; } + // For all remote partitions, get the corresponding inbox. grooves_box_table_t* inbox = &par->inbox[rmt_pid]; - if (!inbox->count) continue; + if (!inbox->count) { continue; } // If the inbox has some values, determine which type of processing unit - // corresponds to this partition and call the appropriate scatter function - if (!engine_get_comm_prev(rmt_pid)) continue; + // corresponds to this partition and call the appropriate scatter function. + if (!engine_get_comm_prev(rmt_pid)) { continue; } if (par->processor.type == PROCESSOR_CPU) { betweenness_scatter_cpu(par->id, inbox, state); } else if (par->processor.type == PROCESSOR_GPU) { @@ -648,10 +630,8 @@ PRIVATE void betweenness_scatter_forward(partition_t* par) { } } -/* - * Parallel CPU implementation of betweenness gather function - */ -PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, +// Parallel CPU implementation of betweenness gather function. +PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, betweenness_state_t* state, score_t* values) { cost_t* distance = state->distance[pid]; @@ -659,79 +639,73 @@ PRIVATE inline void betweenness_gather_cpu(int pid, grooves_box_table_t* inbox, OMP(omp parallel for schedule(runtime)) for (vid_t index = 0; index < inbox->count; index++) { vid_t vid = inbox->rmt_nbrs[index]; - // Check whether the vertex's distance is equal to level + 1 + // Check whether the vertex's distance is equal to level + 1. if (distance[vid] == (state->level + 1)) { // If it is, we'll pass the vertex's current delta value to neighbouring - // nodes to be used during their next backward propagation phase - values[index] = delta[vid]; + // nodes to be used during their next backward propagation phase. + values[index] = delta[vid]; } } } -/* - * Kernel for betweenness_gather_gpu - */ -__global__ -void betweenness_gather_kernel(const vid_t* __restrict rmt_nbrs, +__global__ +void betweenness_gather_kernel(const vid_t* __restrict rmt_nbrs, const vid_t rmt_nbrs_count, - const cost_t* __restrict distance, - const cost_t level, - const score_t* __restrict delta, + const cost_t* __restrict distance, + const cost_t level, + const score_t* __restrict delta, score_t* values) { vid_t index = THREAD_GLOBAL_INDEX; - if (index >= rmt_nbrs_count) return; + if (index >= rmt_nbrs_count) { return; } vid_t vid = rmt_nbrs[index]; - // Check whether the vertex's distance is equal to level + 1 + // Check whether the vertex's distance is equal to level + 1. if (distance[vid] == level + 1) { - // If it is, we'll pass the vertex's current delta value to neighbouring - // nodes to be used during their next backward propagation phase - values[index] = delta[vid]; + // If it is, we'll pass the vertex's current delta value to neighbouring + // nodes to be used during their next backward propagation phase. + values[index] = delta[vid]; } } -/* - * Parallel GPU implementation of betweenness gather function - */ -PRIVATE inline +// Parallel GPU implementation of betweenness gather function. 
+PRIVATE inline void betweenness_gather_gpu(partition_t* par, grooves_box_table_t* inbox, betweenness_state_t* state, score_t* values) { dim3 blocks, threads; - KERNEL_CONFIGURE(inbox->count, blocks, threads); - // Invoke the appropriate CUDA kernel to perform the gather functionality + KERNEL_CONFIGURE(inbox->count, blocks, threads); + // Invoke the appropriate CUDA kernel to perform the gather functionality. betweenness_gather_kernel<<streams[1]>>> - (inbox->rmt_nbrs, inbox->count, state->distance[par->id], + (inbox->rmt_nbrs, inbox->count, state->distance[par->id], state->level, state->delta[par->id], values); CALL_CU_SAFE(cudaGetLastError()); } -/** - * Pass the number of shortest paths and delta values to neighbouring - * vertices to be used in the backwards propagation phase - */ +// Pass the number of shortest paths and delta values to neighbouring +// vertices to be used in the backwards propagation phase. PRIVATE void betweenness_gather_backward(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); if (par->processor.type == PROCESSOR_GPU) { if (!state->comm[state->level]) { cudaMemsetAsync(state->frontier.count, 0, sizeof(vid_t), par->streams[1]); } else { - frontier_update_list_gpu(&state->frontier, state->level, + frontier_update_list_gpu(&state->frontier, state->level, state->distance[par->id], par->streams[1]); } } - + for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (rmt_pid == par->id) continue; - grooves_box_table_t* inbox = &par->inbox[rmt_pid]; - // For all remote partitions, get the corresponding inbox - if (!inbox->count) continue; - score_t* values = (score_t*)inbox->pull_values; + if (rmt_pid == par->id) { continue; } + grooves_box_table_t* inbox = &par->inbox[rmt_pid]; + // For all remote partitions, get the corresponding inbox. + if (!inbox->count) { continue; } + score_t* values = reinterpret_cast(inbox->pull_values); - if (!engine_get_comm_curr(rmt_pid)) continue; + if (!engine_get_comm_curr(rmt_pid)) { continue; } // If the inbox has some values, determine which type of processing unit // corresponds to this partition and call the appropriate gather function if (par->processor.type == PROCESSOR_CPU) { @@ -740,59 +714,56 @@ PRIVATE void betweenness_gather_backward(partition_t* par) { betweenness_gather_gpu(par, inbox, state, values); } else { assert(false); - } + } } } -/** - * Initializes the state for a round of backward propagation - */ +// Initializes the state for a round of backward propagation. PRIVATE void betweenness_init_backward(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. 
+ totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; } - // Initialize the delta values to 0 - CALL_SAFE(totem_memset(state->delta[par->id], (score_t)0, vcount, type, + // Initialize the delta values to 0. + CALL_SAFE(totem_memset(state->delta[par->id], (score_t)0, vcount, type, par->streams[1])); - state->level--; } -/** - * Initializes the state for a round of forward propagation - */ +// Initializes the state for a round of forward propagation. PRIVATE void betweenness_init_forward(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); - // Get the source partition and source vertex values + // Get the source partition and source vertex values. id_t src_pid = GET_PARTITION_ID(bc_g.src); id_t src_vid = GET_VERTEX_ID(bc_g.src); vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; } - // Initialize the distances to infinity and numSPs to 0 + // Initialize the distances to infinity and numSPs to 0. for (int pid = 0; pid < engine_partition_count(); pid++) { vid_t count = (pid != par->id) ? par->outbox[pid].count : vcount; if (count) { - CALL_SAFE(totem_memset((state->distance[pid]), INF_COST, count, type, + CALL_SAFE(totem_memset((state->distance[pid]), INF_COST, count, type, par->streams[1])); - CALL_SAFE(totem_memset((state->numSPs[pid]), (uint32_t)0, count, type, + CALL_SAFE(totem_memset((state->numSPs[pid]), (uint32_t)0, count, type, par->streams[1])); } } @@ -803,9 +774,9 @@ PRIVATE void betweenness_init_forward(partition_t* par) { CALL_SAFE(totem_memset(&((state->numSPs[par->id])[src_vid]), (uint32_t)1, 1, type, par->streams[1])); } - + // Initialize the outbox to 0 and set the level to 0 - engine_set_outbox(par->id, 0); + engine_set_outbox(par->id, 0); state->level = 0; totem_memset(state->comm, true, engine_vertex_count(), TOTEM_MEM_HOST); } @@ -814,68 +785,69 @@ PRIVATE void betweenness_init_forward(partition_t* par) { * Allocates and initializes the state for Betweenness Centrality */ PRIVATE void betweenness_init(partition_t* par) { - if (!par->subgraph.vertex_count) return; + if (!par->subgraph.vertex_count) { return; } // Allocate memory for the per-partition state - betweenness_state_t* state = (betweenness_state_t*) - calloc(1, sizeof(betweenness_state_t)); - assert(state); - // Set the partition's state variable to the previously allocated state + betweenness_state_t* state = reinterpret_cast + (calloc(1, sizeof(betweenness_state_t))); + assert(state); + // Set the partition's state variable to the previously allocated state. par->algo_state = state; vid_t vcount = par->subgraph.vertex_count; - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { + // Determine which type of memory this partition corresponds to. 
+ totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; frontier_init_gpu(&state->frontier, par->subgraph.vertex_count); } - + CALL_SAFE(totem_calloc(vcount * sizeof(score_t), type, - (void**)&(state->delta[par->id]))); + reinterpret_cast(&(state->delta[par->id])))); CALL_SAFE(totem_calloc(vcount * sizeof(score_t), type, - (void**)&(state->betweenness))); + reinterpret_cast(&(state->betweenness)))); // Allocate memory for the various pieces of data required for the - // Betweenness Centrality algorithm + // Betweenness Centrality algorithm. for (int pid = 0; pid < engine_partition_count(); pid++) { vid_t count = (pid != par->id) ? par->outbox[pid].count : vcount; if (count) { - CALL_SAFE(totem_malloc(count * sizeof(cost_t), type, - (void**)&(state->distance[pid]))); - CALL_SAFE(totem_calloc(count * sizeof(uint32_t), type, - (void**)&(state->numSPs[pid]))); - } + CALL_SAFE(totem_malloc(count * sizeof(cost_t), type, + reinterpret_cast + (&(state->distance[pid])))); + CALL_SAFE(totem_calloc(count * sizeof(uint32_t), type, + reinterpret_cast(&(state->numSPs[pid])))); + } state->numSPs_f[pid] = state->numSPs[pid]; } - // Initialize the state's done flag + // Initialize the state's done flag. state->done = engine_get_finished_ptr(par->id); - // Initialize the comm array - totem_calloc(engine_vertex_count(), TOTEM_MEM_HOST, (void**)&state->comm); + // Initialize the comm array. + totem_calloc(engine_vertex_count(), TOTEM_MEM_HOST, + reinterpret_cast(&state->comm)); - // Initialize the state + // Initialize the state. betweenness_init_forward(par); } -/** - * Cleans up allocated memory on the CPU and GPU - */ +// Cleans up allocated memory on the CPU and GPU. PRIVATE void betweenness_finalize(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - - // Free the allocated memory - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; - - // Determine which type of memory this partition corresponds to - totem_mem_t type = TOTEM_MEM_HOST; - if (par->processor.type == PROCESSOR_GPU) { - type = TOTEM_MEM_DEVICE; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } + + // Free the allocated memory. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); + + // Determine which type of memory this partition corresponds to. + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_GPU) { + type = TOTEM_MEM_DEVICE; frontier_finalize_gpu(&state->frontier); } - // Free the memory allocated for the algorithm + // Free the memory allocated for the algorithm. for (int pid = 0; pid < engine_partition_count(); pid++) { totem_free(state->distance[pid], type); totem_free(state->numSPs[pid], type); @@ -884,34 +856,34 @@ PRIVATE void betweenness_finalize(partition_t* par) { totem_free(state->betweenness, type); totem_free(state->comm, TOTEM_MEM_HOST); - // Free the per-partition state and set it to NULL + // Free the per-partition state and set it to NULL. 
free(state); par->algo_state = NULL; } -/** - * Aggregates the final result to be returned at the end - */ -PRIVATE void betweenness_aggr(partition_t* par) { - if (!par->subgraph.vertex_count) return; - // Get the current state of the algorithm - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; +// Aggregates the final result to be returned at the end +PRIVATE void betweenness_aggr(partition_t* par) { + if (!par->subgraph.vertex_count) { return; } + // Get the current state of the algorithm. + betweenness_state_t* state = + reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; score_t* betweenness_values = NULL; - // Determine which type of processor this partition corresponds to + // Determine which type of processor this partition corresponds to. if (par->processor.type == PROCESSOR_CPU) { - // If it is a CPU partition, grab the computed betweenness value directly + // If it is a CPU partition, grab the computed betweenness value directly. betweenness_values = state->betweenness; } else if (par->processor.type == PROCESSOR_GPU) { - // If it is a GPU partition, copy the computed score back to the host + // If it is a GPU partition, copy the computed score back to the host. assert(bc_g.betweenness_score_h); - CALL_CU_SAFE(cudaMemcpy(bc_g.betweenness_score_h, state->betweenness, + CALL_CU_SAFE(cudaMemcpy(bc_g.betweenness_score_h, state->betweenness, subgraph->vertex_count * sizeof(score_t), cudaMemcpyDefault)); betweenness_values = bc_g.betweenness_score_h; } else { assert(false); } + // Aggregate the results assert(bc_g.betweenness_score); OMP(omp parallel for schedule(static)) @@ -925,42 +897,44 @@ PRIVATE void betweenness_aggr(partition_t* par) { // computed using a subset of the total nodes within the graph // The scaling value is: (Total Number of Nodes / Subset of Nodes Used) bc_g.betweenness_score[par->map[v]] = betweenness_values[v] * - (score_t)(((double)(engine_vertex_count())) / bc_g.num_samples); + static_cast((static_cast(engine_vertex_count())) / + bc_g.num_samples); } } } -/** - * The following two functions are the kernel and gather callbacks of a single - * BSP cycle that synchronizes the distance of remote vertices - */ +// The following two functions are the kernel and gather callbacks of a single +// BSP cycle that synchronizes the distance of remote vertices. PRIVATE void betweenness_gather_distance(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } if (engine_superstep() == 1) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); engine_gather_inbox(par->id, state->distance[par->id]); } } PRIVATE void betweenness_synch_distance(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. 
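  // (This kernel and betweenness_gather_distance above form one pull-based
  // BSP round: in superstep 1 the gather callback hands the engine the local
  // distance array so that remote partitions can pull the values of boundary
  // vertices, and only "not finished" is reported; in superstep 2 the pulled
  // values arrive in outbox[rmt_pid].pull_values and are copied into the
  // local mirrors distance[rmt_pid]. The numSPs pair below repeats the same
  // pattern.)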
+ if (!par->subgraph.vertex_count) { return; } + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); if (engine_superstep() == 1) { engine_report_not_finished(); } else { for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (par->id == rmt_pid) continue; + if (par->id == rmt_pid) { continue; } if (par->processor.type == PROCESSOR_GPU) { - CALL_CU_SAFE(cudaMemcpyAsync(state->distance[rmt_pid], + CALL_CU_SAFE(cudaMemcpyAsync(state->distance[rmt_pid], par->outbox[rmt_pid].pull_values, - par->outbox[rmt_pid].count * + par->outbox[rmt_pid].count * sizeof(cost_t), cudaMemcpyDefault, par->streams[1])); } else { - cost_t* src = (cost_t*)par->outbox[rmt_pid].pull_values; + cost_t* src = + reinterpret_cast(par->outbox[rmt_pid].pull_values); cost_t* dst = state->distance[rmt_pid]; OMP(omp parallel for schedule(static)) for (int i = 0; i < par->outbox[rmt_pid].count; i++) { @@ -971,37 +945,38 @@ PRIVATE void betweenness_synch_distance(partition_t* par) { } } -/** - * The following two functions are the kernel and gather callbacks of a single - * BSP cycle that synchronizes the numSPs of remote vertices - */ +// The following two functions are the kernel and gather callbacks of a single +// BSP cycle that synchronizes the numSPs of remote vertices. PRIVATE void betweenness_gather_numSPs(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } if (engine_superstep() == 1) { - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); engine_gather_inbox(par->id, state->numSPs[par->id]); } } PRIVATE void betweenness_synch_numSPs(partition_t* par) { - // Check if there is no work to be done - if (!par->subgraph.vertex_count) return; - betweenness_state_t* state = (betweenness_state_t*)par->algo_state; + // Check if there is no work to be done. + if (!par->subgraph.vertex_count) { return; } + betweenness_state_t* state = + reinterpret_cast(par->algo_state); assert(state); if (engine_superstep() == 1) { engine_report_not_finished(); } else { for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { - if (par->id == rmt_pid) continue; + if (par->id == rmt_pid) { continue; } if (par->processor.type == PROCESSOR_GPU) { - CALL_CU_SAFE(cudaMemcpyAsync(state->numSPs[rmt_pid], + CALL_CU_SAFE(cudaMemcpyAsync(state->numSPs[rmt_pid], par->outbox[rmt_pid].pull_values, - par->outbox[rmt_pid].count * + par->outbox[rmt_pid].count * sizeof(uint32_t), cudaMemcpyDefault, par->streams[1])); } else { - uint32_t* src = (uint32_t*)par->outbox[rmt_pid].pull_values; + uint32_t* src = + reinterpret_cast(par->outbox[rmt_pid].pull_values); uint32_t* dst = state->numSPs[rmt_pid]; OMP(omp parallel for schedule(static)) for (int i = 0; i < par->outbox[rmt_pid].count; i++) { @@ -1012,33 +987,31 @@ PRIVATE void betweenness_synch_numSPs(partition_t* par) { } } -/** - * Core functionality for main for loop within the BC computation - */ +// Core functionality for main for loop within the BC computation. void betweenness_hybrid_core(vid_t source, bool is_first_iteration, bool is_last_iteration) { - // Set the source node for this iteration + // Set the source node for this iteration. bc_g.src = engine_vertex_id_in_partition(source); - // Forward propagation + // Forward propagation. 
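  // (One source iteration is built out of several BSP rounds driven by the
  // engine: a forward round with push communication that computes distance
  // and numSPs, two short pull rounds that replicate the distance and numSPs
  // of boundary vertices to the partitions that reference them, and a
  // backward round with pull communication that accumulates delta and adds
  // each vertex's dependency into its betweenness score.)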
engine_par_init_func_t init_forward = betweenness_init_forward; if (is_first_iteration) { init_forward = betweenness_init; } // Configure the parameters for forward propagation given the current - // iteration of the overall computation + // iteration of the overall computation. engine_config_t config_forward = { - NULL, betweenness_forward, betweenness_scatter_forward, NULL, + NULL, betweenness_forward, betweenness_scatter_forward, NULL, init_forward, NULL, NULL, GROOVES_PUSH }; - // Call Totem to begin the computation phase given the specified - // configuration + // Call Totem to begin the computation phase given the specified + // configuration. engine_config(&config_forward); engine_execute(); // Synchronize the distance and numSPs state, which have been calculated in // the forward phase, across all partitions. This state will be used in the - // backward propagation phase + // backward propagation phase. engine_config_t config_distance_state = { NULL, betweenness_synch_distance, NULL, betweenness_gather_distance, NULL, NULL, NULL, GROOVES_PULL @@ -1077,55 +1050,56 @@ void betweenness_hybrid_core(vid_t source, bool is_first_iteration, error_t betweenness_hybrid(double epsilon, score_t* betweenness_score) { // Sanity check on input bool finished = false; - error_t rc = betweenness_check_special_cases(engine_get_graph(), + error_t rc = betweenness_check_special_cases(engine_get_graph(), &finished, betweenness_score); - if (finished) return rc; + if (finished) { return rc; } // Initialize the global state memset(&bc_g, 0, sizeof(bc_g)); bc_g.betweenness_score = betweenness_score; - CALL_SAFE(totem_memset(bc_g.betweenness_score, (score_t)0, + CALL_SAFE(totem_memset(bc_g.betweenness_score, (score_t)0, engine_vertex_count(), TOTEM_MEM_HOST)); bc_g.epsilon = epsilon; if (engine_largest_gpu_partition()) { CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(score_t), - TOTEM_MEM_HOST_PINNED, - (void**)&bc_g.betweenness_score_h)); + TOTEM_MEM_HOST_PINNED, + reinterpret_cast + (&bc_g.betweenness_score_h))); } // Determine whether we will compute exact or approximate BC values if (epsilon == CENTRALITY_EXACT) { // Compute exact values for Betweenness Centrality vid_t vcount = engine_vertex_count(); - for (vid_t source = 0; source < vcount; source++) { - betweenness_hybrid_core(source, (source == 0), (source == (vcount-1))); + for (vid_t source = 0; source < vcount; source++) { + betweenness_hybrid_core(source, (source == 0), (source == (vcount-1))); } } else { // Compute approximate values based on the value of epsilon provided // Select a subset of source nodes to make the computation faster int num_samples = centrality_get_number_sample_nodes(engine_vertex_count(), epsilon); - // Store the number of samples used in the global state to be used for + // Store the number of samples used in the global state to be used for // scaling the computed metric during aggregation bc_g.num_samples = num_samples; // Populate the array of indices to sample vid_t* sample_nodes = centrality_select_sampling_nodes( engine_get_graph(), num_samples); - + for (int source_index = 0; source_index < num_samples; source_index++) { // Get the next sample node in the array to use as a source - vid_t source = sample_nodes[source_index]; - betweenness_hybrid_core(source, (source_index == 0), - (source_index == (num_samples-1))); - } - + vid_t source = sample_nodes[source_index]; + betweenness_hybrid_core(source, (source_index == 0), + (source_index == (num_samples-1))); + } + // Clean up the allocated memory 
free(sample_nodes); } - + // Clean up and return - if (engine_largest_gpu_partition()) { + if (engine_largest_gpu_partition()) { totem_free(bc_g.betweenness_score_h, TOTEM_MEM_HOST_PINNED); } memset(&bc_g, 0, sizeof(betweenness_global_state_t)); diff --git a/src/alg/totem_bfs_stepwise_hybrid.cu b/src/alg/totem_bfs_stepwise_hybrid.cu new file mode 100644 index 0000000..c5748cd --- /dev/null +++ b/src/alg/totem_bfs_stepwise_hybrid.cu @@ -0,0 +1,447 @@ +/** + * This file contains an implementation of the breadth-first search (BFS) graph + * search algorithm using the totem framework. This is a modified version that + * performs the algorithm in a Bottom Up fashion. + * + * This implementation only works for undirected graphs. + * + * Based off of the work by Scott Beamer et al. + * Searching for a Parent Instead of Fighting Over Children: A Fast + * Breadth-First Search Implementation for Graph500. + * http://www.eecs.berkeley.edu/Pubs/TechRpts/2011/EECS-2011-117.pdf + * + * TODO(scott): Modify the algorithm to swap between top down and bottom up + * steps. + * + * Created on: 2014-08-26 + * Authors: Scott Sallinen + * Abdullah Gharaibeh + */ + +#include "totem_alg.h" +#include "totem_engine.cuh" + +// Per-partition specific state. +typedef struct bfs_state_s { + cost_t* cost; // One slot per vertex in the partition. + bitmap_t visited[MAX_PARTITION_COUNT]; // A list of bitmaps, one for each + // remote partition. + bitmap_t frontier[MAX_PARTITION_COUNT]; // A list of bitmaps, one for each + // remote partition. + bool* finished; // Points to Totem's finish flag. + cost_t level; // Current level to process by the partition. + frontier_state_t frontier_state; // Frontier management state. +} bfs_state_t; + +// State shared between all partitions. +typedef struct bfs_global_state_s { + cost_t* cost; // Final output buffer. + cost_t* cost_h; // Used as a temporary buffer to receive the final + // result copied back from GPU partitions before being + // copied again to the final output buffer. + // TODO(abdullah): push this buffer to be managed by Totem + vid_t src; // Source vertex id. (The id after partitioning.) +} bfs_global_state_t; +PRIVATE bfs_global_state_t state_g = {NULL, NULL, 0}; + +// Checks for input parameters and special cases. This is invoked at the +// beginning of public interfaces (GPU and CPU) +PRIVATE error_t check_special_cases(vid_t src, cost_t* cost, bool* finished) { + *finished = true; + if ((src >= engine_vertex_count()) || (cost == NULL)) { + return FAILURE; + } else if (engine_vertex_count() == 1) { + cost[0] = 0; + return SUCCESS; + } else if (engine_edge_count() == 0) { + // Initialize cost to INFINITE. + totem_memset(cost, INF_COST, engine_vertex_count(), TOTEM_MEM_HOST); + cost[src] = 0; + return SUCCESS; + } + *finished = false; + return SUCCESS; +} + +// A step that iterates across unvisited vertices and determines +// their status in the next frontier. +PRIVATE void bfs_bu_step(partition_t* par, bfs_state_t* state) { + graph_t* subgraph = &par->subgraph; + bool finished = true; + + // Iterate across all of our vertices. + OMP(omp parallel for schedule(runtime) reduction(& : finished)) + for (vid_t vertex_id = 0; vertex_id < subgraph->vertex_count; vertex_id++) { + // Ignore the local vertex if it has already been visited. + if (bitmap_is_set(state->visited[par->id], vertex_id)) { continue; } + + // Iterate across the neighbours of this vertex. 
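    // (Bottom-up rationale, following Beamer et al.: instead of the frontier
    // expanding to its children, every still-unvisited vertex scans its own
    // neighbours looking for any parent that is already in the frontier; as
    // soon as one is found the vertex joins the next frontier and the scan
    // stops early via the break below. This saves edge traversals when the
    // frontier covers a large fraction of the graph.)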
+ for (eid_t i = subgraph->vertices[vertex_id]; + i < subgraph->vertices[vertex_id + 1]; i++) { + int nbr_pid = GET_PARTITION_ID(subgraph->edges[i]); + int nbr = GET_VERTEX_ID(subgraph->edges[i]); + + // Check if the bitmap corresponding to the vertices PID is set. + // This means the partition that the vertex belongs to, has explored it. + if (bitmap_is_set(state->frontier[nbr_pid], nbr)) { + // Add the vertex we are exploring to the next frontier. + bitmap_set_cpu(state->visited[par->id], vertex_id); + + // Increment the level of this vertex. + state->cost[vertex_id] = state->level + 1; + finished = false; + break; + } + } // End of neighbour check - vertex examined. + } // All vertices examined in level. + + // Move over the finished variable. + if (!finished) *(state->finished) = false; +} + +// This is a CPU version of the Bottom-up/Top-down BFS algorithm. +// See file header for full details. +void bfs_stepwise_cpu(partition_t* par, bfs_state_t* state) { + // Update the frontier. + frontier_update_bitmap_cpu(&state->frontier_state, state->visited[par->id]); + state->frontier[par->id] = state->frontier_state.current; + + // Execute a step. + bfs_bu_step(par, state); +} + +// A gpu version of the Bottom-up step as a kernel. +__global__ void bfs_bu_kernel(partition_t par, bfs_state_t state) { + const vid_t vertex_id = THREAD_GLOBAL_INDEX; + graph_t subgraph = par.subgraph; + if (vertex_id >= subgraph.vertex_count) { return; } + bool finished = true; + + // Ignore the local vertex if it has already been visited. + if (bitmap_is_set(state.visited[par.id], vertex_id)) { return; } + + // Iterate across all neighbours of the vertex. + for (eid_t i = subgraph.vertices[vertex_id]; + i < subgraph.vertices[vertex_id + 1]; i++) { + int nbr_pid = GET_PARTITION_ID(subgraph.edges[i]); + int nbr = GET_VERTEX_ID(subgraph.edges[i]); + + // Check if neighbour is in the current frontier. + if (bitmap_is_set(state.frontier[nbr_pid], nbr)) { + // Add the vertex we are exploring to the next frontier. + bitmap_set_gpu(state.visited[par.id], vertex_id); + + // Increment the level of this vertex. + state.cost[vertex_id] = state.level + 1; + finished = false; + break; + } + } + + // Move over the finished variable. + if (!finished) *(state.finished) = false; +} + +// This is a GPU version of the Bottom-up/Top-down BFS algorithm. +// See file header for full details. +__host__ error_t bfs_stepwise_gpu(partition_t* par, bfs_state_t* state) { + dim3 blocks; + dim3 threads_per_block; + KERNEL_CONFIGURE(par->subgraph.vertex_count, blocks, threads_per_block); + + // Update the frontier. + frontier_update_bitmap_gpu(&state->frontier_state, state->visited[par->id], + par->streams[1]); + state->frontier[par->id] = state->frontier_state.current; + + // Execute a step. + bfs_bu_kernel<<streams[1]>>> + (*par, *state); + + return SUCCESS; +} + +// The execution phase - based off of the partition we are, launch an approach. +PRIVATE void bfs(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + + // Ignore the first round - this allows us to communicate the frontier with + // an updated visited status of the source vertex. + if (engine_superstep() == 1) { + engine_report_not_finished(); + return; + } + + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Launch the processor specific algorithm. 
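  // (Both paths do the same two things: refresh the frontier bitmap from the
  // visited bitmap via frontier_update_bitmap_cpu/gpu, then run one
  // bottom-up step over the local vertices; the GPU variant launches
  // bfs_bu_kernel on streams[1].)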
+ if (par->processor.type == PROCESSOR_CPU) { + bfs_stepwise_cpu(par, state); + } else if (par->processor.type == PROCESSOR_GPU) { + bfs_stepwise_gpu(par, state); + } else { + assert(false); + } + + // At the end of the round, increase our BFS level. + state->level++; +} + +// Gather for the CPU bitmap to inbox. +PRIVATE void bfs_gather_cpu(partition_t* par, bfs_state_t* state, + grooves_box_table_t* inbox) { + // Iterate across the items in the inbox. + OMP(omp parallel for schedule(static)) + for (vid_t index = 0; index < inbox->count; index++) { + // Lookup the local vertex it points to. + vid_t vid = inbox->rmt_nbrs[index]; + + // Set the bit state to our local state. + if (bitmap_is_set(state->visited[par->id], vid)) { + bitmap_set_cpu(reinterpret_cast(inbox->pull_values), index); + } + } +} + +// Gather for the GPU bitmap to inbox. +__global__ void bfs_gather_gpu(partition_t par, bfs_state_t state, + grooves_box_table_t inbox) { + const vid_t index = THREAD_GLOBAL_INDEX; + if (index >= inbox.count) { return; } + + // Lookup the local vertex it points to. + vid_t vid = inbox.rmt_nbrs[index]; + + // Set the bit state to our local state. + if (bitmap_is_set(state.visited[par.id], vid)) { + bitmap_set_gpu(reinterpret_cast(inbox.pull_values), index); + } +} + +// The gather phase - apply values from the inboxes to the partitions' local +// variables. +PRIVATE void bfs_gather(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Across all partitions that are not us. + for (int rmt_pid = 0; rmt_pid < engine_partition_count(); rmt_pid++) { + if (rmt_pid == par->id) { continue; } + + // Select the inbox to apply to. + grooves_box_table_t* inbox = &par->inbox[rmt_pid]; + if (inbox->count == 0) { continue; } + + // Select a method based off of our processor type. + if (par->processor.type == PROCESSOR_CPU) { + bfs_gather_cpu(par, state, inbox); + } else if (par->processor.type == PROCESSOR_GPU) { + // Set up to iterate across the items in the inbox. + dim3 blocks; + dim3 threads_per_block; + KERNEL_CONFIGURE(inbox->count, blocks, threads_per_block); + bfs_gather_gpu<<streams[1]>>> + (*par, *state, *inbox); + } else { + assert(false); + } + } +} + +// The aggregate phase - combine results to be presented. +PRIVATE void bfs_aggregate(partition_t* par) { + if (!par->subgraph.vertex_count) { return; } + + bfs_state_t* state = reinterpret_cast(par->algo_state); + graph_t* subgraph = &par->subgraph; + cost_t* src_cost = NULL; + + // Apply the cost from our partition into the final cost array. + if (par->processor.type == PROCESSOR_CPU) { + src_cost = state->cost; + } else if (par->processor.type == PROCESSOR_GPU) { + assert(state_g.cost_h); + CALL_CU_SAFE(cudaMemcpy(state_g.cost_h, state->cost, + subgraph->vertex_count * sizeof(cost_t), + cudaMemcpyDefault)); + src_cost = state_g.cost_h; + } else { + assert(false); + } + + // Aggregate the results. + assert(state_g.cost); + OMP(omp parallel for schedule(runtime)) + for (vid_t v = 0; v < subgraph->vertex_count; v++) { + state_g.cost[par->map[v]] = src_cost[v]; + } +} + +// A simple kernel that sets the source vertex to visited on the GPU. +__global__ void bfs_init_bu_kernel(bitmap_t visited, vid_t src) { + if (THREAD_GLOBAL_INDEX != 0) { return; } + bitmap_set_gpu(visited, src); +} + +// Initialize the GPU memory - bitmaps and frontier. +PRIVATE inline void bfs_init_gpu(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Initialize our visited bitmap. 
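  // Bitmap layout: visited[par->id] is this partition's own visited set,
  // while frontier[pid] for each remote partition aliases the corresponding
  // outbox pull buffer, which the engine refreshes every superstep with the
  // remote partition's visited status for boundary vertices, as gathered by
  // bfs_gather.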
+ state->visited[par->id] = bitmap_init_gpu(par->subgraph.vertex_count); + + // Initialize other partitions frontier bitmaps. + for (int pid = 0; pid < engine_partition_count(); pid++) { + // Assign the outboxes to our frontier bitmap pointers. + if (pid != par->id && par->outbox[pid].count != 0) { + state->frontier[pid] = + reinterpret_cast(par->outbox[pid].pull_values); + } + // Clear the inboxes, and also their shadows. + if (pid != par->id && par->inbox[pid].count != 0) { + bitmap_reset_gpu(reinterpret_cast + (par->inbox[pid].pull_values), + par->inbox[pid].count, + par->streams[1]); + bitmap_reset_gpu(reinterpret_cast + (par->inbox[pid].pull_values_s), + par->inbox[pid].count, + par->streams[1]); + } + } + + // Set the source vertex as visited, if it is in our partition. + if (GET_PARTITION_ID(state_g.src) == par->id) { + bfs_init_bu_kernel<<<1, 1, 0, par->streams[1]>>> + (state->visited[par->id], GET_VERTEX_ID(state_g.src)); + CALL_CU_SAFE(cudaGetLastError()); + } + + // Initialize our local frontier. + frontier_init_gpu(&state->frontier_state, par->subgraph.vertex_count); +} + +// Initialize the CPU memory - bitmaps and frontier. +PRIVATE inline void bfs_init_cpu(partition_t* par) { + bfs_state_t* state = reinterpret_cast(par->algo_state); + + // Initialize our visited bitmap. + state->visited[par->id] = bitmap_init_cpu(par->subgraph.vertex_count); + + // Initialize other partitions bitmaps. + for (int pid = 0; pid < engine_partition_count(); pid++) { + // Assign the outboxes to our frontier bitmap pointers. + if (pid != par->id && par->outbox[pid].count != 0) { + state->frontier[pid] = + reinterpret_cast(par->outbox[pid].pull_values); + } + // Clear the inboxes, and also their shadows. + if (pid != par->id && par->inbox[pid].count != 0) { + bitmap_reset_cpu(reinterpret_cast + (par->inbox[pid].pull_values), + par->inbox[pid].count); + bitmap_reset_cpu(reinterpret_cast + (par->inbox[pid].pull_values_s), + par->inbox[pid].count); + } + } + + // Set the source vertex as visited, if it is in our partition. + if (GET_PARTITION_ID(state_g.src) == par->id) { + bitmap_set_cpu(state->visited[par->id], GET_VERTEX_ID(state_g.src)); + } + + // Initialize our local frontier. + frontier_init_cpu(&state->frontier_state, par->subgraph.vertex_count); +} + +// The init phase - Set up the memory and statuses. +PRIVATE void bfs_init(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + bfs_state_t* state = + reinterpret_cast(calloc(1, sizeof(bfs_state_t))); + assert(state); + + // Initialize based off of our processor type. + par->algo_state = state; + totem_mem_t type = TOTEM_MEM_HOST; + if (par->processor.type == PROCESSOR_CPU) { + bfs_init_cpu(par); + } else if (par->processor.type == PROCESSOR_GPU) { + type = TOTEM_MEM_DEVICE; + bfs_init_gpu(par); + } else { + assert(false); + } + + // Allocate memory for the cost array, and set it to INFINITE cost. + CALL_SAFE(totem_malloc(par->subgraph.vertex_count * sizeof(cost_t), type, + reinterpret_cast(&(state->cost)))); + totem_memset(state->cost, INF_COST, par->subgraph.vertex_count, type, + par->streams[1]); + + if (GET_PARTITION_ID(state_g.src) == par->id) { + // For the source vertex, initialize cost. + totem_memset(&((state->cost)[GET_VERTEX_ID(state_g.src)]), (cost_t)0, 1, + type, par->streams[1]); + } + + // Set level 0 to start, and finished pointer. + state->finished = engine_get_finished_ptr(par->id); + state->level = 0; +} + +// The finalize phase - clean up. 
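// Releases only the per-partition state allocated in bfs_init (the visited
// bitmap, the frontier state, and the cost array); the global cost_h staging
// buffer is freed later by bfs_stepwise_hybrid.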
+PRIVATE void bfs_finalize(partition_t* par) { + if (par->subgraph.vertex_count == 0) { return; } + bfs_state_t* state = reinterpret_cast(par->algo_state); + totem_mem_t type = TOTEM_MEM_HOST; + + // Finalize frontiers. + if (par->processor.type == PROCESSOR_CPU) { + bitmap_finalize_cpu(state->visited[par->id]); + frontier_finalize_cpu(&state->frontier_state); + } else if (par->processor.type == PROCESSOR_GPU) { + bitmap_finalize_gpu(state->visited[par->id]); + type = TOTEM_MEM_DEVICE; + frontier_finalize_gpu(&state->frontier_state); + } else { + assert(false); + } + + // Free memory. + totem_free(state->cost, type); + free(state); + par->algo_state = NULL; +} + +// The launch point for the algorithm - set up engine, cost, and launch. +error_t bfs_stepwise_hybrid(vid_t src, cost_t* cost) { + // Check for special cases. + bool finished = false; + error_t rc = check_special_cases(src, cost, &finished); + if (finished) { return rc; } + + // Initialize the global state. + state_g.cost = cost; + state_g.src = engine_vertex_id_in_partition(src); + + // Initialize the engine. + engine_config_t config = { + NULL, bfs, NULL, bfs_gather, bfs_init, bfs_finalize, bfs_aggregate, + GROOVES_PULL + }; + engine_config(&config); + if (engine_largest_gpu_partition()) { + CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(cost_t), + TOTEM_MEM_HOST, + reinterpret_cast(&state_g.cost_h))); + } + engine_execute(); + + // Clean up and return. + if (engine_largest_gpu_partition()) { + totem_free(state_g.cost_h, TOTEM_MEM_HOST); + } + memset(&state_g, 0, sizeof(bfs_global_state_t)); + return SUCCESS; +} diff --git a/src/alg/totem_page_rank.cu b/src/alg/totem_page_rank.cu index 0f4321f..c939e81 100644 --- a/src/alg/totem_page_rank.cu +++ b/src/alg/totem_page_rank.cu @@ -27,19 +27,17 @@ */ typedef struct { // One is added to make it easy to calculate the number of neighbors of the - // last vertex. Another one is added to ensure 8 bytes alignment irrespective - // whether sizeof(eid_t) is 4 or 8. Alignment is enforced for performance + // last vertex. Another one is added to ensure 8 bytes alignment irrespective + // whether sizeof(eid_t) is 4 or 8. Alignment is enforced for performance // reasons. eid_t vertices[VWARP_DEFAULT_BATCH_SIZE + 2]; rank_t rank[VWARP_DEFAULT_BATCH_SIZE]; } vwarp_mem_t; -/** - * Checks for input parameters and special cases. This is invoked at the - * beginning of public interfaces (GPU and CPU) -*/ +// Checks for input parameters and special cases. This is invoked at the +// beginning of public interfaces (GPU and CPU). PRIVATE -error_t check_special_cases(const graph_t* graph, rank_t* rank, +error_t check_special_cases(const graph_t* graph, rank_t* rank, bool* finished) { *finished = true; if (graph == NULL) { @@ -54,26 +52,23 @@ error_t check_special_cases(const graph_t* graph, rank_t* rank, return SUCCESS; } -/** - * A common initialization function for GPU implementations. It allocates and - * initalizes state on the GPU -*/ + +// A common initialization function for GPU implementations. It allocates and +// initalizes state on the GPU. 
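// The mailbox buffer, which accumulates the incoming rank contributions, is
// zero-initialized via totem_calloc; the rank buffer is either seeded from
// rank_i or set uniformly to 1 / vertex_count when rank_i is NULL.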
PRIVATE -error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, +error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, graph_t** graph_d, rank_t **rank_d, rank_t** mailbox_d) { totem_mem_t type = TOTEM_MEM_DEVICE; - - // will be passed to the kernel CHK_SUCCESS(graph_initialize_device(graph, graph_d), err); - - // allocate mailbox and outbox device buffers CHK_SUCCESS(totem_calloc(graph->vertex_count * sizeof(rank_t), type, - (void**)mailbox_d), err_free_graph_d); + reinterpret_cast(mailbox_d)), + err_free_graph_d); CHK_SUCCESS(totem_malloc(rank_length * sizeof(rank_t), type, - (void**)rank_d), err_free_mailbox); + reinterpret_cast(rank_d)), + err_free_mailbox); if (rank_i == NULL) { - rank_t initial_value = 1 / (rank_t)graph->vertex_count; + rank_t initial_value = 1 / static_cast(graph->vertex_count); totem_memset(*rank_d, initial_value, rank_length, type); } else { CHK_CU_SUCCESS(cudaMemcpy(*rank_d, rank_i, rank_length * sizeof(rank_t), @@ -93,15 +88,13 @@ error_t initialize_gpu(const graph_t* graph, rank_t* rank_i, vid_t rank_length, return FAILURE; } -/** - * A common finalize function for GPU implementations. It allocates the host - * output buffer, moves the final results from GPU to the host buffers and - * frees up some resources. -*/ +// A common finalize function for GPU implementations. It allocates the host +// output buffer, moves the final results from GPU to the host buffers and +// frees up some resources. PRIVATE -error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, +error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, rank_t* rank) { - // Copy back the final result + // Copy back the final result. CHK_CU_SUCCESS(cudaMemcpy(rank, rank_d, graph_d->vertex_count * sizeof(rank_t), cudaMemcpyDeviceToHost), err); totem_free(rank_d, TOTEM_MEM_DEVICE); @@ -113,14 +106,12 @@ error_t finalize_gpu(graph_t* graph_d, rank_t* rank_d, rank_t* mailbox_d, return FAILURE; } -/** - * Phase1 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * Produce the sum of the neighbors' ranks. Each vertex atomically - * adds its value to the mailbox of the destination neighbor vertex. - */ +// Phase1 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// Produce the sum of the neighbors' ranks. Each vertex atomically +// adds its value to the mailbox of the destination neighbor vertex. __global__ void sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index + // Get the thread's linear index. vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; @@ -132,74 +123,63 @@ void sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { } } -/** - * Phase2 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * Produce the rank of each vertex. The sum of ranks coming from the incoming - * edges is stored in the mailbox of the vertex. - */ +// Phase2 kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// Produce the rank of each vertex. The sum of ranks coming from the incoming +// edges is stored in the mailbox of the vertex. __global__ void compute_normalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index + // Get the thread's linear index. vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of incoming neighbors' ranks + // Get sum of incoming neighbors' ranks. 
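  // The update applied below is r(v) = (1 - d) / |V| + d * sum, with
  // d = PAGE_RANK_DAMPING_FACTOR. Dividing the result by the out-degree
  // pre-normalizes the rank so that the next round's phase-1 kernel can
  // simply add rank[v] into each neighbour's mailbox.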
rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my normalized rank - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my normalized rank. + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = my_rank / (graph.vertices[vertex_id + 1] - graph.vertices[vertex_id]); } -/** - * Phase2 final kernel of the original PageRank GPU algorithm (i.e., non-vwarp). - * This kernel is similar to the compute_normalized_rank_kernel. The difference - * is that it does not normalize the rank (by dividing it by the number of - * neighbors). It is invoked in the end to get the final, un-normalized, rank. - */ +// Phase2 final kernel of the original PageRank GPU algorithm (i.e., non-vwarp). +// This kernel is similar to the compute_normalized_rank_kernel. The difference +// is that it does not normalize the rank (by dividing it by the number of +// neighbors). It is invoked in the end to get the final, un-normalized, rank. __global__ -void compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, +void compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { - // get the thread's linear index vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of neighbors' ranks rank_t sum = mailbox[vertex_id]; - // calculate my rank - rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); } -/** - * The neighbors processing function. This function adds the a vertex rank to - * to the mailbox of all neighbors. The assumption is that the threads of a warp - * invoke this function to process the warp's batch of work. In each iteration - * of the for loop, each thread processes a neighbor. For example, thread 0 in - * the warp processes neighbors at indices 0, VWARP_DEFAULT_WARP_WIDTH, - * (2 * VWARP_DEFAULT_WARP_WIDTH) etc. in the edges array, while thread 1 in - * the warp processes neighbors 1, (1 + VWARP_DEFAULT_WARP_WIDTH), - * (1 + 2 * VWARP_DEFAULT_WARP_WIDTH) and so on. -*/ +// The neighbors processing function. This function adds the a vertex rank to +// to the mailbox of all neighbors. The assumption is that the threads of a warp +// invoke this function to process the warp's batch of work. In each iteration +// of the for loop, each thread processes a neighbor. For example, thread 0 in +// the warp processes neighbors at indices 0, VWARP_DEFAULT_WARP_WIDTH, +// (2 * VWARP_DEFAULT_WARP_WIDTH) etc. in the edges array, while thread 1 in +// the warp processes neighbors 1, (1 + VWARP_DEFAULT_WARP_WIDTH), +// (1 + 2 * VWARP_DEFAULT_WARP_WIDTH) and so on. __device__ inline -void vwarp_process_neighbors(vid_t warp_offset, vid_t neighbor_count, - vid_t* neighbors, rank_t my_rank, +void vwarp_process_neighbors(vid_t warp_offset, vid_t neighbor_count, + vid_t* neighbors, rank_t my_rank, rank_t* mailbox) { - for(vid_t i = warp_offset; i < neighbor_count; - i += VWARP_DEFAULT_WARP_WIDTH) { + for (vid_t i = warp_offset; i < neighbor_count; + i += VWARP_DEFAULT_WARP_WIDTH) { const vid_t neighbor_id = neighbors[i]; atomicAdd(&(mailbox[neighbor_id]), my_rank); } } -/** - * Phase1 kernel of the vwarp PageRank GPU algorithm. - * Produce the sum of the neighbors' ranks. Each vertex atomically - * adds its value to the mailbox of the destination neighbor vertex. 
- */ +// Phase1 kernel of the vwarp PageRank GPU algorithm. +// Produce the sum of the neighbors' ranks. Each vertex atomically +// adds its value to the mailbox of the destination neighbor vertex. __global__ -void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, +void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox, uint32_t thread_count) { if (THREAD_GLOBAL_INDEX >= thread_count) return; vid_t warp_offset = THREAD_GLOBAL_INDEX % VWARP_DEFAULT_WARP_WIDTH; @@ -207,18 +187,18 @@ void vwarp_sum_neighbors_rank_kernel(graph_t graph, rank_t* rank, __shared__ vwarp_mem_t shared_memory[MAX_THREADS_PER_BLOCK / VWARP_DEFAULT_WARP_WIDTH]; - vwarp_mem_t* my_space = &shared_memory[THREAD_BLOCK_INDEX / + vwarp_mem_t* my_space = &shared_memory[THREAD_BLOCK_INDEX / VWARP_DEFAULT_WARP_WIDTH]; - // copy my work to local space + // Copy my work to local space. vid_t v_ = warp_id * VWARP_DEFAULT_BATCH_SIZE; - vwarp_memcpy(my_space->rank, &rank[v_], + vwarp_memcpy(my_space->rank, &rank[v_], VWARP_DEFAULT_BATCH_SIZE, warp_offset); - vwarp_memcpy(my_space->vertices, &(graph.vertices[v_]), + vwarp_memcpy(my_space->vertices, &(graph.vertices[v_]), VWARP_DEFAULT_BATCH_SIZE + 1, warp_offset); - // iterate over my work - for(vid_t v = 0; v < VWARP_DEFAULT_BATCH_SIZE; v++) { + // Iterate over my work. + for (vid_t v = 0; v < VWARP_DEFAULT_BATCH_SIZE; v++) { vid_t neighbor_count = my_space->vertices[v + 1] - my_space->vertices[v]; vid_t* neighbors = &(graph.edges[my_space->vertices[v]]); vwarp_process_neighbors(warp_offset, neighbor_count, neighbors, @@ -236,20 +216,20 @@ void vwarp_compute_normalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of incoming neighbors' ranks + // Get the sum of incoming neighbors' ranks. rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my normalized rank - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my normalized rank. + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = my_rank / (graph.vertices[vertex_id + 1] - graph.vertices[vertex_id]); } /** - * Phase2 final kernel of the vwarp PageRank GPU algorithm. This kernel is - * similar to the compute_normalized_rank_kernel. The difference is that it - * does not normalize the rank (by dividing it by the number of neighbors). + * Phase2 final kernel of the vwarp PageRank GPU algorithm. This kernel is + * similar to the compute_normalized_rank_kernel. The difference is that it + * does not normalize the rank (by dividing it by the number of neighbors). * It is invoked in the end to get the final, un-normalized, rank. */ __global__ @@ -257,21 +237,21 @@ void vwarp_compute_unnormalized_rank_kernel(graph_t graph, rank_t* rank, rank_t* mailbox) { vid_t vertex_id = THREAD_GLOBAL_INDEX; if (vertex_id >= graph.vertex_count) return; - // get sum of neighbors' ranks + // Get the sum of neighbors' ranks. rank_t sum = mailbox[vertex_id]; - // calculate my rank - rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + + // Calculate my rank. + rank[vertex_id] = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph.vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); } __host__ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. 
bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); if (finished) return rc; - // Allocate and initialize GPU state + // Allocate and initialize GPU state. graph_t* graph_d; rank_t* rank_d; rank_t* mailbox_d; @@ -280,39 +260,39 @@ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { CHK_SUCCESS(initialize_gpu(graph, rank_i, rank_length, &graph_d, &rank_d, &mailbox_d), err); - {// Configure the kernels. Setup the number of threads for phase1 and phase2, - // configure the on-chip memory as shared memory rather than L1 cache - dim3 blocks1, threads_per_block1, blocks2, threads_per_block2; - vid_t phase1_thread_count = - vwarp_default_thread_count(graph->vertex_count); - KERNEL_CONFIGURE(phase1_thread_count, blocks1, threads_per_block1); - KERNEL_CONFIGURE(graph->vertex_count, blocks2, threads_per_block2); - cudaFuncSetCacheConfig(vwarp_sum_neighbors_rank_kernel, - cudaFuncCachePreferShared); - - // Iterate for a specific number of rounds - for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + { + // Configure the kernels. Setup the number of threads for phase1 and phase2, + // configure the on-chip memory as shared memory rather than L1 cache. + dim3 blocks1, threads_per_block1, blocks2, threads_per_block2; + vid_t phase1_thread_count = + vwarp_default_thread_count(graph->vertex_count); + KERNEL_CONFIGURE(phase1_thread_count, blocks1, threads_per_block1); + KERNEL_CONFIGURE(graph->vertex_count, blocks2, threads_per_block2); + cudaFuncSetCacheConfig(vwarp_sum_neighbors_rank_kernel, + cudaFuncCachePreferShared); + + // Iterate for a specific number of rounds. + for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + vwarp_sum_neighbors_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d, phase1_thread_count); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + vwarp_compute_normalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + } + // Final round is seprate. It computes an un-normalized final rank. vwarp_sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d, phase1_thread_count); + (*graph_d, rank_d, mailbox_d, phase1_thread_count); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - vwarp_compute_normalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + vwarp_compute_unnormalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); } - // Final round is seprate. It computes an un-normalized final rank - vwarp_sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d, phase1_thread_count); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - vwarp_compute_unnormalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - } - // Copy the result back from GPU and clean up + // Copy the result back from GPU and clean up. CHK_SUCCESS(finalize_gpu(graph_d, rank_d, mailbox_d, rank), err_free_all); return SUCCESS; - // error handlers err_free_all: totem_free(rank_d, TOTEM_MEM_DEVICE); totem_free(mailbox_d, TOTEM_MEM_DEVICE); @@ -323,44 +303,43 @@ error_t page_rank_vwarp_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { __host__ error_t page_rank_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); if (finished) return rc; - // Allocate and initialize GPU state + // Allocate and initialize GPU state. 
graph_t* graph_d; rank_t* rank_d; rank_t* mailbox_d; - CHK_SUCCESS(initialize_gpu(graph, rank_i, graph->vertex_count, + CHK_SUCCESS(initialize_gpu(graph, rank_i, graph->vertex_count, &graph_d, &rank_d, &mailbox_d), err); { - dim3 blocks, threads_per_block; - KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); - // Iterate for a specific number of rounds - for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + dim3 blocks, threads_per_block; + KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); + // Iterate for a specific number of rounds. + for (int round = 0; round < PAGE_RANK_ROUNDS - 1; round++) { + sum_neighbors_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + compute_normalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); + CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); + } + // Final round is seprate. It computes an un-normalized final rank. sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - compute_normalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); + compute_unnormalized_rank_kernel<<>> + (*graph_d, rank_d, mailbox_d); CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); } - // Final round is seprate. It computes an un-normalized final rank - sum_neighbors_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - compute_unnormalized_rank_kernel<<>> - (*graph_d, rank_d, mailbox_d); - CHK_CU_SUCCESS(cudaGetLastError(), err_free_all); - } - // Copy the result back from GPU and clean up + // Copy the result back from GPU and clean up. CHK_SUCCESS(finalize_gpu(graph_d, rank_d, mailbox_d, rank), err_free_all); return SUCCESS; - // error handlers err_free_all: totem_free(rank_d, TOTEM_MEM_DEVICE); totem_free(mailbox_d, TOTEM_MEM_DEVICE); @@ -370,17 +349,16 @@ error_t page_rank_gpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { - // Check for special cases + // Check for special cases. bool finished = false; error_t rc = check_special_cases(graph, rank, &finished); - if (finished) return rc; + if (finished) { return rc; } - // allocate buffers rank_t* mailbox; - CALL_SAFE(totem_malloc(graph->vertex_count * sizeof(rank_t), TOTEM_MEM_HOST, - (void**)&mailbox)); + CALL_SAFE(totem_malloc(graph->vertex_count * sizeof(rank_t), TOTEM_MEM_HOST, + reinterpret_cast(&mailbox))); - // initialize the rank of each vertex + // Initialize the rank of each vertex. if (rank_i == NULL) { rank_t initial_value = 1 / (rank_t)graph->vertex_count; for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { @@ -395,13 +373,13 @@ error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } for (int round = 0; round < PAGE_RANK_ROUNDS; round++) { - // iterate over all vertices to calculate the ranks for this round + // Iterate over all vertices to calculate the ranks for this round // The "runtime" scheduling clause defer the choice of thread scheduling // algorithm to the choice of the client, either via OS environment variable // or omp_set_schedule interface. OMP(omp parallel for schedule(runtime)) - for(vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { - // calculate the sum of all neighbors' rank + for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { + // Calculate the sum of the neighbors' ranks. 
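      // (Each vertex contributes its current rank value to its neighbours'
      // mailboxes here; the second parallel loop below then folds the
      // accumulated sums into the damping formula and, except in the last
      // round, re-normalizes by the out-degree.)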
rank_t my_rank = rank[vertex_id]; for (eid_t i = graph->vertices[vertex_id]; i < graph->vertices[vertex_id + 1]; i++) { @@ -411,24 +389,21 @@ error_t page_rank_cpu(graph_t* graph, rank_t* rank_i, rank_t* rank) { } // The loop has no load balancing issues, hence the choice of dividing - // the iterations between the threads statically via the static schedule - // clause + // the iterations between the threads statically via the static schedule + // clause. OMP(omp parallel for schedule(static)) - for(vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { - // get sum of neighbors' ranks + for (vid_t vertex_id = 0; vertex_id < graph->vertex_count; vertex_id++) { rank_t sum = mailbox[vertex_id]; mailbox[vertex_id] = 0; - // calculate my rank - vid_t neighbors_count = + vid_t neighbors_count = graph->vertices[vertex_id + 1] - graph->vertices[vertex_id]; - rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph->vertex_count) + + rank_t my_rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / graph->vertex_count) + (PAGE_RANK_DAMPING_FACTOR * sum); rank[vertex_id] = (round == (PAGE_RANK_ROUNDS - 1)) ? my_rank : my_rank / neighbors_count; } } - // we are done! set the output and clean up. totem_free(mailbox, TOTEM_MEM_HOST); return SUCCESS; } diff --git a/src/alg/totem_page_rank_hybrid.cu b/src/alg/totem_page_rank_hybrid.cu index b5162ae..c31e24a 100644 --- a/src/alg/totem_page_rank_hybrid.cu +++ b/src/alg/totem_page_rank_hybrid.cu @@ -21,9 +21,7 @@ #include "totem_alg.h" #include "totem_engine.cuh" -/** - * PageRank specific state - */ +// PageRank-specific state. typedef struct page_rank_state_s { rank_t* rank; rank_t* rank_s; @@ -33,23 +31,15 @@ typedef struct page_rank_state_s { dim3 threads_sum; } page_rank_state_t; -/** - * Stores the final result - */ -rank_t* rank_g = NULL; +// Stores the final result. +PRIVATE rank_t* rank_g = NULL; -/** - * Used as a temporary buffer to host the final result produced by - * GPU partitions - */ -rank_t* rank_h = NULL; +// Used as a temporary buffer to host the final result produced by +// GPU partitions. +PRIVATE rank_t* rank_h = NULL; -/** - * Checks for input parameters and special cases. This is invoked at the - * beginning of public interfaces (GPU and CPU) -*/ -PRIVATE -error_t check_special_cases(rank_t* rank, bool* finished) { +// Checks for input parameters and special cases. +PRIVATE error_t check_special_cases(rank_t* rank, bool* finished) { *finished = true; if (engine_vertex_count() == 0) { return FAILURE; @@ -61,32 +51,28 @@ error_t check_special_cases(rank_t* rank, bool* finished) { return SUCCESS; } -/** - * This structure is used by virtual warp-based implementation. It stores a - * batch of work. It is typically allocated on shared memory and is processed by - * a single virtual warp. - */ +// This structure is used by virtual warp-based implementation. It stores a +// batch of work. It is typically allocated on shared memory and is processed by +// a single virtual warp. typedef struct { eid_t vertices[VWARP_DEFAULT_BATCH_SIZE + 2]; rank_t rank[VWARP_DEFAULT_BATCH_SIZE]; } vwarp_mem_t; -/** - * Phase1 kernel of the PageRank GPU algorithm. Produce the sum of - * the neighbors' ranks. Each vertex atomically adds its value to - * the temporary rank (rank_s) of the destination neighbor vertex. - */ +// Phase1 kernel of the PageRank GPU algorithm. Produce the sum of +// the neighbors' ranks. Each vertex atomically adds its value to +// the temporary rank (rank_s) of the destination neighbor vertex. 
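// Each virtual warp of VWARP_DEFAULT_WARP_WIDTH threads first stages its
// batch's vertex offsets and rank values in shared memory (vwarp_mem_t),
// then strides cooperatively over each vertex's edge list. Destinations are
// resolved through engine_get_dst_ptr, so a contribution lands either in the
// local rank_s buffer or in the outbox of a remote partition.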
__global__ -void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, +void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, rank_t* rank_s, vid_t thread_count) { - if (THREAD_GLOBAL_INDEX >= thread_count) return; + if (THREAD_GLOBAL_INDEX >= thread_count) { return; } vid_t warp_offset = THREAD_GLOBAL_INDEX % VWARP_DEFAULT_WARP_WIDTH; vid_t warp_id = THREAD_GLOBAL_INDEX / VWARP_DEFAULT_WARP_WIDTH; - // copy my work to local space - __shared__ vwarp_mem_t smem[MAX_THREADS_PER_BLOCK / + // Copy my work to local space. + __shared__ vwarp_mem_t smem[MAX_THREADS_PER_BLOCK / VWARP_DEFAULT_WARP_WIDTH]; - vwarp_mem_t* my_space = &smem[THREAD_BLOCK_INDEX / + vwarp_mem_t* my_space = &smem[THREAD_BLOCK_INDEX / VWARP_DEFAULT_WARP_WIDTH]; vid_t v_ = warp_id * VWARP_DEFAULT_BATCH_SIZE; int my_batch_size = VWARP_DEFAULT_BATCH_SIZE; @@ -97,26 +83,28 @@ void vwarp_sum_neighbors_rank_kernel(partition_t par, rank_t* rank, vwarp_memcpy(my_space->vertices, &(par.subgraph.vertices[v_]), my_batch_size + 1, warp_offset); - // iterate over my work - for(vid_t v = 0; v < my_batch_size; v++) { + // Iterate over my work. + for (vid_t v = 0; v < my_batch_size; v++) { vid_t nbr_count = my_space->vertices[v + 1] - my_space->vertices[v]; vid_t* nbrs = &(par.subgraph.edges[my_space->vertices[v]]); - for(vid_t i = warp_offset; i < nbr_count; i += VWARP_DEFAULT_WARP_WIDTH) { + for (vid_t i = warp_offset; i < nbr_count; i += VWARP_DEFAULT_WARP_WIDTH) { const vid_t nbr = nbrs[i]; - rank_t* dst = engine_get_dst_ptr(par.id, nbr, par.outbox, rank_s); + rank_t* dst = engine_get_dst_ptr(par.id, nbr, par.outbox, rank_s); atomicAdd(dst, my_space->rank[v]); } } } __global__ -void compute_normalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, +void compute_normalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, rank_t* rank_s) { vid_t v = THREAD_GLOBAL_INDEX; - if (v >= par.subgraph.vertex_count) return; - rank_t r = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + + if (v >= par.subgraph.vertex_count) { return; } + vid_t nbr_count = (par.subgraph.vertices[v + 1] - par.subgraph.vertices[v]); + if (nbr_count == 0) { return; } + rank_t r = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + (PAGE_RANK_DAMPING_FACTOR * rank_s[v]); - rank[v] = r / (par.subgraph.vertices[v + 1] - par.subgraph.vertices[v]); + rank[v] = r / nbr_count; rank_s[v] = 0; } @@ -125,14 +113,14 @@ void compute_unnormalized_rank_kernel(partition_t par, vid_t vc, rank_t* rank, rank_t* rank_s) { vid_t v = THREAD_GLOBAL_INDEX; if (v >= par.subgraph.vertex_count) return; - rank[v] = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + + rank[v] = ((1 - PAGE_RANK_DAMPING_FACTOR) / vc) + (PAGE_RANK_DAMPING_FACTOR * rank_s[v]); } PRIVATE void page_rank_gpu(partition_t* par) { - page_rank_state_t* ps = (page_rank_state_t*)par->algo_state; + page_rank_state_t* ps = reinterpret_cast(par->algo_state); if (engine_superstep() > 1) { - // compute my rank + // Compute my rank. 
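  // Intermediate supersteps run the normalized kernel, which divides the new
  // rank by the out-degree so it is ready to be pushed in the next round;
  // the final superstep runs the un-normalized kernel so the output holds
  // the actual PageRank values.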
if (engine_superstep() != PAGE_RANK_ROUNDS) { compute_normalized_rank_kernel<<blocks_rank, ps->threads_rank, 0, par->streams[1]>>>(*par, engine_vertex_count(), ps->rank, ps->rank_s); @@ -143,41 +131,40 @@ PRIVATE void page_rank_gpu(partition_t* par) { CALL_CU_SAFE(cudaGetLastError()); } } - // communicate the ranks - engine_set_outbox(par->id, 0); + + engine_set_outbox(par->id, (rank_t)0); vwarp_sum_neighbors_rank_kernel<<blocks_sum, ps->threads_sum, 0, - par->streams[1]>>>(*par, ps->rank, ps->rank_s, + par->streams[1]>>>(*par, ps->rank, ps->rank_s, vwarp_default_thread_count(par->subgraph.vertex_count)); CALL_CU_SAFE(cudaGetLastError()); } PRIVATE void page_rank_cpu(partition_t* par) { - page_rank_state_t* ps = (page_rank_state_t*)par->algo_state; + page_rank_state_t* ps = reinterpret_cast(par->algo_state); graph_t* subgraph = &par->subgraph; vid_t vcount = engine_vertex_count(); int round = engine_superstep(); - if (round > 1) { - // compute my rank The loop has no load balancing issues, hence the choice + // Compute my rank The loop has no load balancing issues, hence the choice // of dividing the iterations between the threads statically via the static - // schedule clause + // schedule clause. OMP(omp parallel for schedule(static)) - for(vid_t v = 0; v < subgraph->vertex_count; v++) { - vid_t nbrs = subgraph->vertices[v + 1] - subgraph->vertices[v]; + for (vid_t v = 0; v < subgraph->vertex_count; v++) { + vid_t nbr_count = subgraph->vertices[v + 1] - subgraph->vertices[v]; + if (nbr_count == 0) { continue; } rank_t rank = ((1 - PAGE_RANK_DAMPING_FACTOR) / vcount) + (PAGE_RANK_DAMPING_FACTOR * ps->rank_s[v]); - ps->rank[v] = (round == (PAGE_RANK_ROUNDS)) ? rank : rank / nbrs; + ps->rank[v] = (round == PAGE_RANK_ROUNDS) ? rank : rank / nbr_count; ps->rank_s[v] = 0; } } - // communicate the ranks - engine_set_outbox(par->id, 0); + engine_set_outbox(par->id, (rank_t)0); // The "runtime" scheduling clause defer the choice of thread scheduling // algorithm to the choice of the client, either via OS environment variable // or omp_set_schedule interface. 
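    // The amount of work per vertex below is proportional to its out-degree,
    // so the best schedule depends on the degree distribution of the input
    // graph; hence the choice is left to the caller.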
OMP(omp parallel for schedule(runtime)) - for(vid_t v = 0; v < subgraph->vertex_count; v++) { + for (vid_t v = 0; v < subgraph->vertex_count; v++) { rank_t my_rank = ps->rank[v]; for (eid_t i = subgraph->vertices[v]; i < subgraph->vertices[v + 1]; i++) { vid_t nbr = subgraph->edges[i]; @@ -188,6 +175,7 @@ PRIVATE void page_rank_cpu(partition_t* par) { } PRIVATE void page_rank(partition_t* partition) { + if (partition->subgraph.vertex_count == 0) { return; } if (partition->processor.type == PROCESSOR_GPU) { page_rank_gpu(partition); } else { @@ -200,13 +188,15 @@ PRIVATE void page_rank(partition_t* partition) { } PRIVATE void page_rank_scatter(partition_t* partition) { - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); engine_scatter_inbox_add(partition->id, ps->rank_s); } PRIVATE void page_rank_aggr(partition_t* partition) { - if (!partition->subgraph.vertex_count) return; - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + if (partition->subgraph.vertex_count == 0) { return; } + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); graph_t* subgraph = &partition->subgraph; rank_t* src_rank = NULL; if (partition->processor.type == PROCESSOR_GPU) { @@ -226,10 +216,10 @@ PRIVATE void page_rank_aggr(partition_t* partition) { PRIVATE void page_rank_init(partition_t* par) { vid_t vcount = par->subgraph.vertex_count; - if (vcount == 0) return; + if (vcount == 0) { return; } page_rank_state_t* ps = NULL; - CALL_SAFE(totem_calloc(sizeof(page_rank_state_t), TOTEM_MEM_HOST, - (void**)&ps)); + CALL_SAFE(totem_calloc(sizeof(page_rank_state_t), TOTEM_MEM_HOST, + reinterpret_cast(&ps))); totem_mem_t type = TOTEM_MEM_HOST; if (par->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; @@ -237,20 +227,24 @@ PRIVATE void page_rank_init(partition_t* par) { ps->blocks_sum, ps->threads_sum); KERNEL_CONFIGURE(vcount, ps->blocks_rank, ps->threads_rank); } - CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, (void**)&(ps->rank_s))); - CALL_SAFE(totem_malloc(vcount * sizeof(rank_t), type, (void**)&(ps->rank))); + CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, + reinterpret_cast(&ps->rank_s))); + CALL_SAFE(totem_calloc(vcount * sizeof(rank_t), type, + reinterpret_cast(&ps->rank))); rank_t init_value = 1 / (rank_t)engine_vertex_count(); totem_memset(ps->rank, init_value, vcount, type, par->streams[1]); par->algo_state = ps; } PRIVATE void page_rank_finalize(partition_t* partition) { + if (partition->subgraph.vertex_count == 0) { return; } assert(partition->algo_state); - page_rank_state_t* ps = (page_rank_state_t*)partition->algo_state; + page_rank_state_t* ps = + reinterpret_cast(partition->algo_state); totem_mem_t type = TOTEM_MEM_HOST; if (partition->processor.type == PROCESSOR_GPU) { type = TOTEM_MEM_DEVICE; - } + } totem_free(ps->rank, type); totem_free(ps->rank_s, type); totem_free(ps, TOTEM_MEM_HOST); @@ -268,13 +262,14 @@ error_t page_rank_hybrid(rank_t *rank_i, rank_t* rank) { // initialize the engine engine_config_t config = { - NULL, page_rank, page_rank_scatter, NULL, page_rank_init, + NULL, page_rank, page_rank_scatter, NULL, page_rank_init, page_rank_finalize, page_rank_aggr, GROOVES_PUSH }; engine_config(&config); if (engine_largest_gpu_partition()) { - CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(rank_t), - TOTEM_MEM_HOST_PINNED, (void**)&rank_h)); + CALL_SAFE(totem_malloc(engine_largest_gpu_partition() * sizeof(rank_t), + TOTEM_MEM_HOST_PINNED, + 
reinterpret_cast(&rank_h))); } engine_execute(); diff --git a/src/alg/totem_stcon.cu b/src/alg/totem_stcon.cu deleted file mode 100644 index cff57eb..0000000 --- a/src/alg/totem_stcon.cu +++ /dev/null @@ -1,64 +0,0 @@ -/* TODO(lauro,abdullah,elizeu): Add license. - * - * This file contains an implementation of the ST Connectivity. - * - * Created on: 2011-04-01 - * Author: Lauro Beltrão Costa - */ - -// totem includes -#include "totem_alg.h" - -__host__ -error_t stcon_gpu(const graph_t* graph, vid_t source_id, vid_t destination_id, - bool* connected) { - if((graph == NULL) || (source_id >= graph->vertex_count) - || (destination_id >= graph->vertex_count)) { - *connected = false; - return FAILURE; - } - if( source_id == destination_id ) { - *connected = true; - return SUCCESS; - } - - dim3 blocks; - dim3 threads_per_block; - KERNEL_CONFIGURE(graph->vertex_count, blocks, threads_per_block); - //TODO(abdullah, lauro) handle the case (vertex_count > number of threads). - assert(graph->vertex_count <= MAX_THREAD_COUNT); - - // Create graph on GPU memory. - graph_t* graph_d; - CHK_SUCCESS(graph_initialize_device(graph, &graph_d), err); - - // TODO(lauro): Finish stcon_gpu implementation. - - graph_finalize_device(graph_d); - *connected = false; - return FAILURE; - - // error handlers - err: - *connected = false; - return FAILURE; -} - -__host__ -error_t stcon_cpu(const graph_t* graph, vid_t source_id, vid_t destination_id, - bool* connected) { - if((graph == NULL) || (source_id >= graph->vertex_count) - || (destination_id >= graph->vertex_count)) { - *connected = false; - return FAILURE; - } - if( source_id == destination_id ) { - *connected = true; - return SUCCESS; - } - - // TODO(lauro): Finish stcon_cpu implementation. - - *connected = false; - return FAILURE; -} diff --git a/src/make.defs b/src/make.defs index 4e853a0..22c175a 100644 --- a/src/make.defs +++ b/src/make.defs @@ -150,5 +150,7 @@ GTEST_SRC = $(GTEST_PATH)/src/gtest_main.cc $(GTEST_PATH)/src/gtest-all.cc TEST_PATH = $(ROOTDIR)/test TEST_DEPS = $(wildcard $(TEST_PATH)/*.[cuh,h]) +WATTSUP_PATH = $(ROOTDIR)/thirdparty/wattsup + TOTEM_PATH = $(ROOTDIR)/totem TOTEM_DEPS = $(wildcard $(TOTEM_PATH)/*.[cuh,h]) diff --git a/src/makefile b/src/makefile index 2958644..52b6a1e 100644 --- a/src/makefile +++ b/src/makefile @@ -21,6 +21,9 @@ graph500: test: @$(MAKE) -C $(TEST_PATH) +wattsup: + @$(MAKE) -C $(WATTSUP_PATH) + clean-build: @rm -rf *~ $(BUILDDIR) diff --git a/src/test/totem_betweenness_unittest.cu b/src/test/totem_betweenness_unittest.cu index e37dcdc..61e7c5c 100644 --- a/src/test/totem_betweenness_unittest.cu +++ b/src/test/totem_betweenness_unittest.cu @@ -1,5 +1,4 @@ -/* TODO(lauro,abdullah,elizeu): Add license. - * +/* * Contains unit tests for betweenness centrality. 
* * Created on: 2011-10-21 @@ -12,50 +11,17 @@ #if GTEST_HAS_PARAM_TEST using ::testing::TestWithParam; -using ::testing::Values; - -/** - * Wrapper for betweenness_cpu to provide the singature expected for use in - * the unit tests with the other Betweenness Centrality algorithms - */ -PRIVATE error_t betweenness_cpu_exact(const graph_t* graph, - score_t* betweenness_score) { - // call betweenness_cpu for use in unit test framework with exact precision - return betweenness_cpu(graph, CENTRALITY_EXACT, betweenness_score); -} - -/** - * Wrapper for betweenness_gpu to provide the singature expected for use in - * the unit tests with the other Betweenness Centrality algorithms - */ -PRIVATE error_t betweenness_gpu_exact(const graph_t* graph, - score_t* betweenness_score) { - // call betweenness_gpu for use in unit test framework with exact precision - return betweenness_gpu(graph, CENTRALITY_EXACT, betweenness_score); -} +using ::testing::ValuesIn; -// The following implementation relies on -// TestWithParam to test the two versions of Betweenness -// Centrality implemented: CPU and GPU. Details on how to use TestWithParam -// can be found at: -// http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc +typedef error_t(*BetwCentralityFunction)(const graph_t*, double, score_t*); +typedef error_t(*BetwCentralityHybridFunction)(double, score_t*); -typedef error_t(*BetwCentralityFunction)(const graph_t*, score_t*); - -// This is to allow testing the vanilla BC functions and the hybrid one -// that is based on the framework. Note that have a different signature -// of the hybrid algorithm forced this work-around. -typedef struct betweenness_param_s { - totem_attr_t* attr; // totem attributes for totem-based tests - BetwCentralityFunction func; // the vanilla BC function if attr is NULL -} betweenness_param_t; - -class BetweennessCentralityTest : public TestWithParam { +class BetweennessCentralityTest : public TestWithParam { public: virtual void SetUp() { - // Ensure the minimum CUDA architecture is supported + // Ensure the minimum CUDA architecture is supported. 
CUDA_CHECK_VERSION(); - _betweenness_param = GetParam(); + _param = GetParam(); _graph = NULL; _betweenness_score = NULL; } @@ -69,35 +35,40 @@ class BetweennessCentralityTest : public TestWithParam { error_t TestGraph() { if (_graph->vertex_count) { - totem_malloc(_graph->vertex_count * sizeof(score_t), - TOTEM_MEM_HOST, (void**)&_betweenness_score); + totem_malloc(_graph->vertex_count * sizeof(score_t), + TOTEM_MEM_HOST, + reinterpret_cast(&_betweenness_score)); } - if (_betweenness_param->attr != NULL) { - _betweenness_param->attr->push_msg_size = + if (_param->attr != NULL) { + _param->attr->push_msg_size = sizeof(uint32_t) * BITS_PER_BYTE; - _betweenness_param->attr->pull_msg_size = + _param->attr->pull_msg_size = sizeof(score_t) * BITS_PER_BYTE; - if (totem_init(_graph, _betweenness_param->attr) == FAILURE) { + if (totem_init(_graph, _param->attr) == FAILURE) { return FAILURE; } - // Will use exact betweenness centrality for test framework - error_t err = betweenness_hybrid(CENTRALITY_EXACT, _betweenness_score); + BetwCentralityHybridFunction func = + reinterpret_cast + (_param->func); + error_t err = func(CENTRALITY_EXACT, _betweenness_score); totem_finalize(); return err; } - return _betweenness_param->func(_graph, _betweenness_score); + BetwCentralityFunction func = + reinterpret_cast(_param->func); + return func(_graph, CENTRALITY_EXACT, _betweenness_score); } protected: - betweenness_param_t* _betweenness_param; - graph_t* _graph; - score_t* _betweenness_score; + test_param_t* _param; + graph_t* _graph; + score_t* _betweenness_score; }; // Tests BetwCentrality for empty graphs. TEST_P(BetweennessCentralityTest, Empty) { - _graph = (graph_t*)calloc(sizeof(graph_t), 1); + _graph = reinterpret_cast(calloc(sizeof(graph_t), 1)); EXPECT_EQ(FAILURE, TestGraph()); free(_graph); _graph = NULL; @@ -115,16 +86,20 @@ TEST_P(BetweennessCentralityTest, SingleNodeUnweighted) { TEST_P(BetweennessCentralityTest, Chain100Unweighted) { graph_initialize(DATA_FOLDER("chain_100_nodes_weight_directed.totem"), false, &_graph); - // First vertex as source + // First vertex as source. EXPECT_EQ(SUCCESS, TestGraph()); - score_t centrality[50]; - for (vid_t i = 0; i < 50; i++) { - centrality[i] = (99 - i) * (i); + score_t* expected_centrality = + reinterpret_cast(calloc(_graph->vertex_count / 2, + sizeof(score_t))); + for (vid_t i = 0; i < (_graph->vertex_count / 2); i++) { + expected_centrality[i] = (_graph->vertex_count - 1 - i) * (i); } - for (vid_t i = 0; i < 50; i++) { - EXPECT_EQ(centrality[i], _betweenness_score[i]); - EXPECT_EQ(centrality[i], _betweenness_score[99 - i]); + for (vid_t i = 0; i < (_graph->vertex_count / 2); i++) { + EXPECT_EQ(expected_centrality[i], _betweenness_score[i]); + EXPECT_EQ(expected_centrality[i], + _betweenness_score[_graph->vertex_count - 1 - i]); } + free(expected_centrality); } // Tests BetwCentrality for a complete graph of 300 nodes. 
@@ -134,7 +109,7 @@ TEST_P(BetweennessCentralityTest, CompleteGraphUnweighted) { false, &_graph)); EXPECT_EQ(SUCCESS, TestGraph()); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { EXPECT_FLOAT_EQ(0.0, _betweenness_score[vertex]); } } @@ -145,82 +120,45 @@ TEST_P(BetweennessCentralityTest, StarGraphUnweighted) { false, &_graph)); EXPECT_EQ(SUCCESS, TestGraph()); - EXPECT_FLOAT_EQ((_graph->vertex_count - 1) * (_graph->vertex_count - 2), + EXPECT_FLOAT_EQ((_graph->vertex_count - 1) * (_graph->vertex_count - 2), _betweenness_score[0]); - for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { EXPECT_FLOAT_EQ(0.0, _betweenness_score[vertex]); } } -// Functions to test in framework -betweenness_param_t betweenness_params[] = { - {NULL, &betweenness_unweighted_cpu}, - {NULL, &betweenness_unweighted_gpu}, - {NULL, &betweenness_unweighted_shi_gpu}, - {NULL, &betweenness_cpu_exact}, - {NULL, &betweenness_gpu_exact}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL}, - {&totem_attrs[8], NULL}, - {&totem_attrs[9], NULL}, - {&totem_attrs[10], NULL}, - {&totem_attrs[11], NULL}, - {&totem_attrs[12], NULL}, - {&totem_attrs[13], NULL}, - {&totem_attrs[14], NULL}, - {&totem_attrs[15], NULL}, - {&totem_attrs[16], NULL}, - {&totem_attrs[17], NULL}, - {&totem_attrs[18], NULL}, - {&totem_attrs[19], NULL}, - {&totem_attrs[20], NULL}, - {&totem_attrs[21], NULL}, - {&totem_attrs[22], NULL}, - {&totem_attrs[23], NULL} +// Defines the set of Betweenness vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* vanilla_funcs[] = { + reinterpret_cast(&betweenness_cpu), + reinterpret_cast(&betweenness_gpu), }; +static const int vanilla_count = STATIC_ARRAY_COUNT(vanilla_funcs); + +// Defines the set of PageRank hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* hybrid_funcs[] = { + reinterpret_cast(&betweenness_hybrid), +}; +static const int hybrid_count = STATIC_ARRAY_COUNT(hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. +static const int params_count = vanilla_count + + hybrid_count * hybrid_configurations_count; +static test_param_t* params[params_count]; // From Google documentation: // In order to run value-parameterized tests, we need to instantiate them, // or bind them to a list of values which will be used as test parameters. 
// -// Values() receives a list of parameters and the framework will execute the -// whole set of tests BetweennessCentralityTest for each element of Values() -INSTANTIATE_TEST_CASE_P(BetwCentralityGPUAndCPUTest, BetweennessCentralityTest, - Values(&betweenness_params[0], - &betweenness_params[1], - &betweenness_params[2], - &betweenness_params[3], - &betweenness_params[4], - &betweenness_params[5], - &betweenness_params[6], - &betweenness_params[7], - &betweenness_params[8], - &betweenness_params[9], - &betweenness_params[10], - &betweenness_params[11], - &betweenness_params[12], - &betweenness_params[13], - &betweenness_params[14], - &betweenness_params[15], - &betweenness_params[16], - &betweenness_params[17], - &betweenness_params[18], - &betweenness_params[19], - &betweenness_params[20], - &betweenness_params[21], - &betweenness_params[22], - &betweenness_params[23], - &betweenness_params[24], - &betweenness_params[25], - &betweenness_params[26], - &betweenness_params[27], - &betweenness_params[28])); +// ValuesIn() receives a list of parameters and the framework will execute the +// whole set of tests for each entry in the array passed to ValuesIn(). +INSTANTIATE_TEST_CASE_P(BetweennessGPUAndCPUTest, BetweennessCentralityTest, + ValuesIn(GetParameters(params, params_count, + vanilla_funcs, vanilla_count, + hybrid_funcs, hybrid_count), + params + params_count)); #else // From Google documentation: diff --git a/src/test/totem_bfs_unittest.cu b/src/test/totem_bfs_unittest.cu index 1dd9aaa..d1c9cce 100644 --- a/src/test/totem_bfs_unittest.cu +++ b/src/test/totem_bfs_unittest.cu @@ -1,4 +1,4 @@ -/* +/* * Contains unit tests for an implementation of the breadth-first search (BFS) * graph search algorithm. * @@ -12,24 +12,17 @@ #if GTEST_HAS_PARAM_TEST using ::testing::TestWithParam; -using ::testing::Values; +using ::testing::ValuesIn; // The following implementation relies on TestWithParam to test -// the two versions of BFS implemented: CPU and GPU. -// Details on how to use TestWithParam can be found at: +// the the different versions of BFS. Details on how to use TestWithParam +// can be found at: // http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc typedef error_t(*BFSFunction)(graph_t*, vid_t, cost_t*); +typedef error_t(*BFSHybridFunction)(vid_t, cost_t*); -// This is to allow testing the vanilla bfs functions and the hybrid one -// that is based on the framework. Note that have a different signature -// of the hybrid algorithm forced this work-around. 
-typedef struct bfs_param_s { - totem_attr_t* attr; // totem attributes for totem-based tests - BFSFunction func; // the vanilla bfs function if attr is NULL -} bfs_param_t; - -class BFSTest : public TestWithParam { +class BFSTest : public TestWithParam { public: virtual void SetUp() { // Ensure the minimum CUDA architecture is supported @@ -46,25 +39,30 @@ class BFSTest : public TestWithParam { error_t TestGraph(vid_t src) { if (_bfs_param->attr) { _bfs_param->attr->push_msg_size = 1; + _bfs_param->attr->pull_msg_size = 1; if (totem_init(_graph, _bfs_param->attr) == FAILURE) { return FAILURE; } - error_t err = bfs_hybrid(src, _cost); + BFSHybridFunction func = + reinterpret_cast(_bfs_param->func); + error_t err = func(src, _cost); totem_finalize(); return err; } - return _bfs_param->func(_graph, src, _cost); + BFSFunction func = reinterpret_cast(_bfs_param->func); + return func(_graph, src, _cost); } + protected: - bfs_param_t* _bfs_param; - totem_mem_t _mem_type; - graph_t* _graph; - cost_t* _cost; + test_param_t* _bfs_param; + totem_mem_t _mem_type; + graph_t* _graph; + cost_t* _cost; }; // Tests BFS for empty graphs. TEST_P(BFSTest, Empty) { - _graph = (graph_t*)calloc(1, sizeof(graph_t)); + _graph = reinterpret_cast(calloc(1, sizeof(graph_t))); EXPECT_EQ(FAILURE, TestGraph(0)); EXPECT_EQ(FAILURE, TestGraph(99)); free(_graph); @@ -74,8 +72,8 @@ TEST_P(BFSTest, Empty) { // Tests BFS for single node graphs. TEST_P(BFSTest, SingleNode) { graph_initialize(DATA_FOLDER("single_node.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); EXPECT_EQ(SUCCESS, TestGraph(0)); EXPECT_EQ((cost_t)0, _cost[0]); EXPECT_EQ(FAILURE, TestGraph(1)); @@ -83,8 +81,8 @@ TEST_P(BFSTest, SingleNode) { TEST_P(BFSTest, SingleNodeLoop) { graph_initialize(DATA_FOLDER("single_node_loop.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); EXPECT_EQ(SUCCESS, TestGraph(0)); EXPECT_EQ((cost_t)0, _cost[0]); EXPECT_EQ(FAILURE, TestGraph(1)); @@ -94,14 +92,14 @@ TEST_P(BFSTest, SingleNodeLoop) { TEST_P(BFSTest, EmptyEdges) { graph_initialize(DATA_FOLDER("disconnected_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(INF_COST, _cost[vertex]); } @@ -109,14 +107,14 @@ TEST_P(BFSTest, EmptyEdges) { source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source; vertex < _graph->vertex_count - 1; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count - 1; vertex++) { EXPECT_EQ(INF_COST, _cost[vertex]); } // A vertex in the middle as source source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { 
EXPECT_EQ((vertex == source) ? (cost_t)0 : INF_COST, _cost[vertex]); } @@ -127,28 +125,29 @@ TEST_P(BFSTest, EmptyEdges) { // Tests BFS for a chain of 1000 nodes. TEST_P(BFSTest, Chain) { graph_initialize(DATA_FOLDER("chain_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = source; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(vertex, _cost[vertex]); } // Last vertex as source source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = source; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ(source - vertex, _cost[vertex]); } // A vertex in the middle as source source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { - EXPECT_EQ((cost_t)abs((double)source - (double)vertex), _cost[vertex]); + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + EXPECT_EQ((cost_t)abs(static_cast(source) - + static_cast(vertex)), _cost[vertex]); } // Non existent vertex source @@ -159,14 +158,14 @@ TEST_P(BFSTest, Chain) { TEST_P(BFSTest, CompleteGraph) { graph_initialize(DATA_FOLDER("complete_graph_300_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } @@ -174,14 +173,14 @@ TEST_P(BFSTest, CompleteGraph) { source = _graph->vertex_count - 1; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = 0; vertex < source; vertex++) { + for (vid_t vertex = 0; vertex < source; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } // A vertex source in the middle source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); - for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)((source == vertex) ? 0 : 1), _cost[vertex]); } @@ -192,14 +191,14 @@ TEST_P(BFSTest, CompleteGraph) { // Tests BFS for a complete graph of 1000 nodes. 
TEST_P(BFSTest, Star) { graph_initialize(DATA_FOLDER("star_1000_nodes.totem"), false, &_graph); - CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, - (void**)&_cost)); + CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(cost_t), _mem_type, + reinterpret_cast(&_cost))); // First vertex as source vid_t source = 0; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); - for(vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++){ + for (vid_t vertex = source + 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)1, _cost[vertex]); } @@ -208,7 +207,7 @@ TEST_P(BFSTest, Star) { EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)0, _cost[source]); EXPECT_EQ((cost_t)1, _cost[0]); - for(vid_t vertex = 1; vertex < source - 1; vertex++) { + for (vid_t vertex = 1; vertex < source - 1; vertex++) { EXPECT_EQ((cost_t)2, _cost[vertex]); } @@ -216,7 +215,7 @@ TEST_P(BFSTest, Star) { source = 199; EXPECT_EQ(SUCCESS, TestGraph(source)); EXPECT_EQ((cost_t)1, _cost[0]); - for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { + for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) { EXPECT_EQ((cost_t)((source == vertex) ? 0 : 2), _cost[vertex]); } @@ -224,42 +223,31 @@ TEST_P(BFSTest, Star) { EXPECT_EQ(FAILURE, TestGraph(_graph->vertex_count)); } -// TODO(lauro): Add test cases for not well defined structures. - -// Values() seems to accept only pointers, hence the possible parameters -// are defined here, and a pointer to each ot them is used. -bfs_param_t bfs_params[] = { - {NULL, &bfs_cpu}, - {NULL, &bfs_bu_cpu}, - {NULL, &bfs_queue_cpu}, - {NULL, &bfs_gpu}, - {NULL, &bfs_bu_gpu}, - {NULL, &bfs_vwarp_gpu}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL}, - {&totem_attrs[8], NULL}, - {&totem_attrs[9], NULL}, - {&totem_attrs[10], NULL}, - {&totem_attrs[11], NULL}, - {&totem_attrs[12], NULL}, - {&totem_attrs[13], NULL}, - {&totem_attrs[14], NULL}, - {&totem_attrs[15], NULL}, - {&totem_attrs[16], NULL}, - {&totem_attrs[17], NULL}, - {&totem_attrs[18], NULL}, - {&totem_attrs[19], NULL}, - {&totem_attrs[20], NULL}, - {&totem_attrs[21], NULL}, - {&totem_attrs[22], NULL}, - {&totem_attrs[23], NULL} +// Defines the set of BFS vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +void* bfs_vanilla_funcs[] = { + reinterpret_cast(&bfs_cpu), + reinterpret_cast(&bfs_bu_cpu), + reinterpret_cast(&bfs_queue_cpu), + reinterpret_cast(&bfs_gpu), + reinterpret_cast(&bfs_vwarp_gpu), + reinterpret_cast(&bfs_bu_gpu), +}; +const int bfs_vanilla_count = STATIC_ARRAY_COUNT(bfs_vanilla_funcs); + +// Defines the set of BFS hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +void* bfs_hybrid_funcs[] = { + reinterpret_cast(&bfs_hybrid), + reinterpret_cast(&bfs_stepwise_hybrid), }; +const int bfs_hybrid_count = STATIC_ARRAY_COUNT(bfs_hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. 
+static const int bfs_params_count = bfs_vanilla_count + + bfs_hybrid_count * hybrid_configurations_count; +static test_param_t* bfs_params[bfs_params_count]; // From Google documentation: // In order to run value-parameterized tests, we need to instantiate them, @@ -267,37 +255,12 @@ bfs_param_t bfs_params[] = { // // Values() receives a list of parameters and the framework will execute the // whole set of tests BFSTest for each element of Values() -INSTANTIATE_TEST_CASE_P(BFSGPUAndCPUTest, BFSTest, Values(&bfs_params[0], - &bfs_params[1], - &bfs_params[2], - &bfs_params[3], - &bfs_params[4], - &bfs_params[5], - &bfs_params[6], - &bfs_params[7], - &bfs_params[8], - &bfs_params[9], - &bfs_params[10], - &bfs_params[11], - &bfs_params[12], - &bfs_params[13], - &bfs_params[14], - &bfs_params[15], - &bfs_params[16], - &bfs_params[17], - &bfs_params[18], - &bfs_params[19], - &bfs_params[20], - &bfs_params[21], - &bfs_params[22], - &bfs_params[23], - &bfs_params[24], - &bfs_params[25], - &bfs_params[26], - &bfs_params[27], - &bfs_params[28], - &bfs_params[29] )); - +INSTANTIATE_TEST_CASE_P(BFSGPUAndCPUTest, BFSTest, + ValuesIn(GetParameters( + bfs_params, bfs_params_count, + bfs_vanilla_funcs, bfs_vanilla_count, + bfs_hybrid_funcs, bfs_hybrid_count), + bfs_params + bfs_params_count)); #else // From Google documentation: diff --git a/src/test/totem_common_unittest.h b/src/test/totem_common_unittest.h index bf1c823..1d3a39b 100644 --- a/src/test/totem_common_unittest.h +++ b/src/test/totem_common_unittest.h @@ -27,7 +27,7 @@ #define CUDA_CHECK_VERSION() \ do { \ - if(check_cuda_version() != SUCCESS) { \ + if (check_cuda_version() != SUCCESS) { \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -47,7 +47,7 @@ if (!(stmt)) { \ printf("Error line: %d\n", __LINE__); \ } \ - } while(0) + } while (0) // Hybrid algorithms attributes const float CPU_SHARE_ZERO = 0; @@ -58,109 +58,195 @@ const bool VERTEX_IDS_SORTED = true; const bool VERTEX_IDS_NOT_SORTED = false; const int GPU_COUNT_ONE = 1; PRIVATE totem_attr_t totem_attrs[] = { - // (0) CPU only - {PAR_RANDOM, PLATFORM_CPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (1) GPU only - {PAR_RANDOM, PLATFORM_GPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (2) Multi GPU - {PAR_RANDOM, PLATFORM_GPU, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (3) Hybrid CPU + 1 GPU - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (4) Hybrid CPU + 1 GPU - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (5) Hybrid CPU + 1 GPU - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (6) Hybrid CPU + 1 GPU, sorted vertices - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (7) Hybrid CPU + 1 GPU, sorted vertices - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, 
GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (8) Hybrid CPU + 1 GPU, sorted vertices - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (9) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (10) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (11) Hybrid CPU + 1 GPU (memory mapped GPU partition) - {PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (12) Hybrid CPU + all GPU - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (13) Hybrid CPU + all GPU - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (14) Hybrid CPU + all GPU - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (15) Hybrid CPU + all GPU, sorted vertices - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (16) Hybrid CPU + all GPU, sorted vertices - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (17) Hybrid CPU + all GPU, sorted vertices - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (18) Hybrid CPU + all GPU, randomized vertex placement - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (19) Hybrid CPU + all GPU, randomized vertex placement - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (20) Hybrid CPU + all GPU, randomized vertex placement - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - - // (21) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (22) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, 
CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, - // (23) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement - {PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, - GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, - MSG_SIZE_ZERO, MSG_SIZE_ZERO}, + { // (0) CPU only + PAR_RANDOM, PLATFORM_CPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (1) GPU only + PAR_RANDOM, PLATFORM_GPU, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (2) Multi GPU + PAR_RANDOM, PLATFORM_GPU, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ZERO, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (3) Hybrid CPU + 1 GPU + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (4) Hybrid CPU + 1 GPU + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (5) Hybrid CPU + 1 GPU + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (6) Hybrid CPU + 1 GPU, sorted vertices + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (7) Hybrid CPU + 1 GPU, sorted vertices + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (8) Hybrid CPU + 1 GPU, sorted vertices + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (9) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_RANDOM, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (10) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_SORTED_ASC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (11) Hybrid CPU + 1 GPU (memory mapped GPU partition) + PAR_SORTED_DSC, PLATFORM_HYBRID, GPU_COUNT_ONE, GPU_GRAPH_MEM_MAPPED, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (12) Hybrid CPU + all GPU + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (13) Hybrid CPU + all GPU + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (14) Hybrid CPU + all GPU + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, 
+ + { // (15) Hybrid CPU + all GPU, sorted vertices + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (16) Hybrid CPU + all GPU, sorted vertices + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (17) Hybrid CPU + all GPU, sorted vertices + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_DISABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (18) Hybrid CPU + all GPU, randomized vertex placement + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (19) Hybrid CPU + all GPU, randomized vertex placement + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (20) Hybrid CPU + all GPU, randomized vertex placement + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_NOT_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + + { // (21) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_RANDOM, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (22) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_SORTED_ASC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, + { // (23) Hybrid CPU + all GPU, sorted vertices, randomized vertex placement + PAR_SORTED_DSC, PLATFORM_HYBRID, get_gpu_count(), GPU_GRAPH_MEM_DEVICE, + GPU_PAR_RANDOMIZED_ENABLED, VERTEX_IDS_SORTED, CPU_SHARE_ONE_THIRD, + MSG_SIZE_ZERO, MSG_SIZE_ZERO + }, }; -#endif // TOTEM_COMMON_UNITTEST_H +// A macro that computes the number of elements of a static array. +#define STATIC_ARRAY_COUNT(array) sizeof(array) / sizeof(*array); + +// The number of hybrid configurations in the totem_attr array. +static const int hybrid_configurations_count = STATIC_ARRAY_COUNT(totem_attrs); + +// This is to allow testing the vanilla and the hybrid functions that are +// based on the Totem framework. +typedef struct { + totem_attr_t* attr; // Attributes for totem-based implementations. + void* func; // The algorithm function to be tested. +} test_param_t; + +// Adds a test parameter to the passed vector of parameters. +static void PushParam(std::vector* params_vector, + totem_attr_t* attr, void* func) { + test_param_t param; + param.attr = attr; + param.func = func; + params_vector->push_back(param); +} + +// Returns a reference to an array of references to the various test parameters +// to be tested. 
+static test_param_t** GetParameters(test_param_t** params, int params_count,
+                                    void** vanilla_funcs, int vanilla_count,
+                                    void** hybrid_funcs, int hybrid_count) {
+  // When this function is passed as a parameter to "ValuesIn" in the context
+  // of the INSTANTIATE_TEST_CASE_P macro, it gets invoked more than once;
+  // therefore, the following check ensures that the parameters array is
+  // initialized only once.
+  static bool initialized = false;
+  if (initialized) { return params; }
+
+  // This vector maintains the state of the different parameters during the
+  // tests, and hence it is defined static.
+  static std::vector<test_param_t> params_vector;
+
+  // Add the vanilla implementations.
+  for (int i = 0; i < vanilla_count; i++) {
+    PushParam(&params_vector, NULL, vanilla_funcs[i]);
+  }
+
+  // Add the hybrid implementations.
+  for (int i = 0; i < hybrid_count; i++) {
+    // Add the different configurations of the hybrid implementation.
+    for (uint32_t j = 0; j < hybrid_configurations_count; j++) {
+      PushParam(&params_vector, &totem_attrs[j], hybrid_funcs[i]);
+    }
+  }
+
+  // Fill the params array with references to the parameters to be tested
+  // (maintained by params_vector throughout the execution of the tests).
+  assert(static_cast<size_t>(params_count) == params_vector.size());
+  for (size_t i = 0; i != params_vector.size(); i++) {
+    params[i] = &params_vector[i];
+  }
+
+  initialized = true;
+  return params;
+}
+
+#endif  // TOTEM_COMMON_UNITTEST_H
diff --git a/src/test/totem_page_rank_unittest.cu b/src/test/totem_page_rank_unittest.cu
index 441946c..70ae4ff 100644
--- a/src/test/totem_page_rank_unittest.cu
+++ b/src/test/totem_page_rank_unittest.cu
@@ -1,5 +1,6 @@
 /*
- * Contains unit tests for an implementation of the PageRank graph algorithm.
+ * Contains unit tests for the different implementations of the PageRank graph
+ * algorithm.
  *
  * Created on: 2011-03-22
  * Author: Abdullah Gharaibeh
@@ -11,28 +12,26 @@
 #if GTEST_HAS_PARAM_TEST
 
 using ::testing::TestWithParam;
-using ::testing::Values;
+using ::testing::ValuesIn;
+
+// IMPORTANT NOTE: Some of the unit tests for page_rank_incoming_hybrid will
+// fail if the type rank_t is float due to differences in single precision
+// calculations between the CPU and the GPU. Defining rank_t as double should
+// allow the tests to pass.
+
 // The following implementation relies on TestWithParam<T> to
-// test the two versions of PageRank implemented: CPU and GPU.
-// Details on how to use TestWithParam<T> can be found at:
-// totem_bfs_unittest.cc and
+// test the different versions of PageRank. Details on how to use
+// TestWithParam<T> can be found at:
 // http://code.google.com/p/googletest/source/browse/trunk/samples/sample7_unittest.cc
 typedef error_t(*PageRankFunction)(graph_t*, rank_t*, rank_t*);
+typedef error_t(*PageRankHybridFunction)(rank_t*, rank_t*);
 
-// This is to allow testing the vanilla bfs functions and the hybrid one
-// that is based on the framework. Note that have a different signature
-// of the hybrid algorithm forced this work-around.
-typedef struct page_rank_param_s {
-  totem_attr_t* attr;     // totem attributes for totem-based tests
-  PageRankFunction func;  // the vanilla page_rank function if attr is NULL
-} page_rank_param_t;
-
-class PageRankTest : public TestWithParam<page_rank_param_t*> {
+class PageRankTest : public TestWithParam<test_param_t*> {
  public:
   virtual void SetUp() {
-    // Ensure the minimum CUDA architecture is supported
+    // Ensure the minimum CUDA architecture is supported.
     CUDA_CHECK_VERSION();
     _page_rank_param = GetParam();
     _rank = NULL;
@@ -40,39 +39,46 @@ class PageRankTest : public TestWithParam<page_rank_param_t*> {
   }
 
   virtual void TearDown() {
-    if (_graph) graph_finalize(_graph);
-    if (_rank) totem_free(_rank, TOTEM_MEM_HOST_PINNED);
+    if (_graph) { graph_finalize(_graph); }
+    if (_rank) { totem_free(_rank, TOTEM_MEM_HOST_PINNED); }
   }
 
   error_t TestGraph() {
-    // the graph should be undirected because the test is shared between the
+    // The graph should be undirected because the test is shared between the
     // two versions of the PageRank algorithm: incoming- and outgoing- based.
     EXPECT_FALSE(_graph->directed);
     if (_graph->vertex_count != 0) {
-      CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(rank_t),
-                             TOTEM_MEM_HOST_PINNED, (void**)&_rank));
+      CALL_SAFE(totem_malloc(_graph->vertex_count * sizeof(rank_t),
+                             TOTEM_MEM_HOST_PINNED,
+                             reinterpret_cast<void**>(&_rank)));
     }
     if (_page_rank_param->attr != NULL) {
       _page_rank_param->attr->pull_msg_size = sizeof(rank_t) * BITS_PER_BYTE;
+      _page_rank_param->attr->push_msg_size = sizeof(rank_t) * BITS_PER_BYTE;
       if (totem_init(_graph, _page_rank_param->attr) == FAILURE) {
         return FAILURE;
      }
-      error_t err = page_rank_incoming_hybrid(NULL, _rank);
+      PageRankHybridFunction func =
+          reinterpret_cast<PageRankHybridFunction>(_page_rank_param->func);
+      error_t err = func(NULL, _rank);
       totem_finalize();
       return err;
+    } else {
+      PageRankFunction func =
+          reinterpret_cast<PageRankFunction>(_page_rank_param->func);
+      return func(_graph, NULL, _rank);
     }
-    return _page_rank_param->func(_graph, NULL, _rank);
   }
 
  protected:
-  page_rank_param_t* _page_rank_param;
+  test_param_t* _page_rank_param;
   rank_t* _rank;
   graph_t* _graph;
 };
 
 // Tests PageRank for empty graphs.
 TEST_P(PageRankTest, Empty) {
-  _graph = (graph_t*)calloc(sizeof(graph_t), 1);
+  _graph = reinterpret_cast<graph_t*>(calloc(sizeof(graph_t), 1));
   EXPECT_EQ(FAILURE, TestGraph());
   free(_graph);
   _graph = NULL;
@@ -92,7 +98,7 @@ TEST_P(PageRankTest, Chain) {
   EXPECT_EQ(SUCCESS, graph_initialize(DATA_FOLDER("chain_1000_nodes.totem"),
                                       false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 0; vertex < _graph->vertex_count/2; vertex++){
+  for (vid_t vertex = 0; vertex < _graph->vertex_count / 2; vertex++) {
     EXPECT_FLOAT_EQ(_rank[vertex], _rank[_graph->vertex_count - vertex - 1]);
   }
 }
@@ -103,7 +109,7 @@ TEST_P(PageRankTest, CompleteGraph) {
             graph_initialize(DATA_FOLDER("complete_graph_300_nodes.totem"),
                              false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 0; vertex < _graph->vertex_count; vertex++){
+  for (vid_t vertex = 0; vertex < _graph->vertex_count; vertex++) {
     EXPECT_FLOAT_EQ(_rank[0], _rank[vertex]);
   }
 }
@@ -114,53 +120,48 @@ TEST_P(PageRankTest, Star) {
             graph_initialize(DATA_FOLDER("star_1000_nodes.totem"),
                              false, &_graph));
   EXPECT_EQ(SUCCESS, TestGraph());
-  for(vid_t vertex = 1; vertex < _graph->vertex_count; vertex++){
+  for (vid_t vertex = 1; vertex < _graph->vertex_count; vertex++) {
     EXPECT_FLOAT_EQ(_rank[1], _rank[vertex]);
     EXPECT_GT(_rank[0], _rank[vertex]);
   }
 }
 
-// TODO(abdullah): Add test cases for not well defined structures.
-// TODO(abdullah,lauro): Add test cases for non-empty vertex set and empty edge
-// set.
-
-// Values() seems to accept only pointers, hence the possible parameters
-// are defined here, and a pointer to each ot them is used.
-page_rank_param_t page_rank_params[] = { - {NULL, &page_rank_cpu}, - {NULL, &page_rank_gpu}, - {NULL, &page_rank_vwarp_gpu}, - {NULL, &page_rank_incoming_cpu}, - {NULL, &page_rank_incoming_gpu}, - {&totem_attrs[0], NULL}, - {&totem_attrs[1], NULL}, - {&totem_attrs[2], NULL}, - {&totem_attrs[3], NULL}, - {&totem_attrs[4], NULL}, - {&totem_attrs[5], NULL}, - {&totem_attrs[6], NULL}, - {&totem_attrs[7], NULL} +// Defines the set of PageRank vanilla implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* vanilla_funcs[] = { + reinterpret_cast(&page_rank_cpu), + reinterpret_cast(&page_rank_incoming_cpu), + reinterpret_cast(&page_rank_gpu), + reinterpret_cast(&page_rank_vwarp_gpu), + reinterpret_cast(&page_rank_incoming_gpu), }; +static const int vanilla_count = STATIC_ARRAY_COUNT(vanilla_funcs); -// Values() receives a list of parameters and the framework will execute the -// whole set of tests PageRankTest for each element of Values() -// TODO(abdullah): both versions of the PageRank algorithm (the incoming- and -// outgoing- based) can share the same tests because all the graphs are -// undirected. Separate the two for cases where the graphs are directed. +// Defines the set of PageRank hybrid implementations to be tested. To test +// a new implementation, simply add it to the set below. +static void* hybrid_funcs[] = { + reinterpret_cast(&page_rank_hybrid), + reinterpret_cast(&page_rank_incoming_hybrid), +}; +static const int hybrid_count = STATIC_ARRAY_COUNT(hybrid_funcs); + +// Maintains references to the different configurations (vanilla and hybrid) +// that will be tested by the framework. +static const int params_count = vanilla_count + + hybrid_count * hybrid_configurations_count; +static test_param_t* params[params_count]; + +// From Google documentation: +// In order to run value-parameterized tests, we need to instantiate them, +// or bind them to a list of values which will be used as test parameters. +// +// ValuesIn() receives a list of parameters and the framework will execute the +// whole set of tests for each entry in the array passed to ValuesIn(). INSTANTIATE_TEST_CASE_P(PageRankGPUAndCPUTest, PageRankTest, - Values(&page_rank_params[0], - &page_rank_params[1], - &page_rank_params[2], - &page_rank_params[3], - &page_rank_params[4], - &page_rank_params[5], - &page_rank_params[6], - &page_rank_params[7], - &page_rank_params[8], - &page_rank_params[9], - &page_rank_params[10], - &page_rank_params[11], - &page_rank_params[12])); + ValuesIn(GetParameters(params, params_count, + vanilla_funcs, vanilla_count, + hybrid_funcs, hybrid_count), + params + params_count)); #else diff --git a/src/thirdparty/wattsup/makefile b/src/thirdparty/wattsup/makefile new file mode 100644 index 0000000..f9541e8 --- /dev/null +++ b/src/thirdparty/wattsup/makefile @@ -0,0 +1,17 @@ +#-------------------------------------------------------------------------- +# Wattsup makefile. +# +# Created on: 2014-08-28 +# Author: Abdullah Gharaibeh +#-------------------------------------------------------------------------- +TARGET = wattsup +ROOTDIR = ../.. +include $(ROOTDIR)/make.defs + +all: $(BINDIR)/$(TARGET) + +$(BINDIR)/$(TARGET): $(BINDIR) objects + @printf "\nLinking %s executable ...\n" $(BINDIR)/$(TARGET) + $(NVCC) -o $@ $(COBJS) $(CCOBJS) $(CUOBJS) $(LIBS) $(LFLAGS) + +include $(ROOTDIR)/make.rules
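Note on the test-parameter plumbing introduced in this patch: GetParameters() flattens two inputs into a single gtest parameter list; every vanilla function becomes one test_param_t with a NULL attribute, and every hybrid function is paired with each entry of totem_attrs. The standalone sketch below illustrates the same pattern using only standard C++. All names in it (DummyAttr, DummyParam, BuildParams, configs) are hypothetical stand-ins, not Totem or gtest symbols.

// Standalone sketch of the parameter-flattening pattern (hypothetical names;
// standard C++ only). Every "vanilla" function becomes one parameter with a
// NULL attribute; every "hybrid" function is paired with every configuration.
#include <cassert>
#include <cstdio>
#include <vector>

struct DummyAttr { int id; };   // stand-in for a configuration (totem_attr_t)
struct DummyParam {             // stand-in for test_param_t
  DummyAttr* attr;              // NULL for vanilla implementations
  void* func;                   // the implementation under test
};

static DummyAttr configs[] = { {0}, {1}, {2} };
static const int kConfigCount = sizeof(configs) / sizeof(*configs);

static void PushParam(std::vector<DummyParam>* v, DummyAttr* attr, void* func) {
  DummyParam p;
  p.attr = attr;
  p.func = func;
  v->push_back(p);
}

// Flattens the vanilla functions and the (hybrid function x configuration)
// pairs into one array of stable pointers, as GetParameters() does above.
static DummyParam** BuildParams(DummyParam** out, int out_count,
                                void** vanilla, int vanilla_count,
                                void** hybrid, int hybrid_count) {
  // static: the parameter objects must outlive this call, because only
  // pointers to them are handed to the test framework.
  static std::vector<DummyParam> storage;
  for (int i = 0; i < vanilla_count; i++) {
    PushParam(&storage, NULL, vanilla[i]);
  }
  for (int i = 0; i < hybrid_count; i++) {
    for (int j = 0; j < kConfigCount; j++) {
      PushParam(&storage, &configs[j], hybrid[i]);
    }
  }
  assert(static_cast<size_t>(out_count) == storage.size());
  for (size_t i = 0; i < storage.size(); i++) { out[i] = &storage[i]; }
  return out;
}

static void vanilla_a() {}
static void hybrid_b() {}

int main() {
  // Function pointers are stored as void*, mirroring the casts in the patch.
  void* vanilla[] = { reinterpret_cast<void*>(&vanilla_a) };
  void* hybrid[]  = { reinterpret_cast<void*>(&hybrid_b) };
  const int count = 1 + 1 * kConfigCount;  // 1 vanilla + 1 hybrid x 3 configs
  DummyParam* params[count];
  BuildParams(params, count, vanilla, 1, hybrid, 1);
  printf("generated %d test parameters\n", count);
  return 0;
}

If BuildParams() could be invoked more than once, a guard such as the static initialized flag used by GetParameters() in the patch would be needed to avoid duplicating the entries.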
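A small robustness suggestion (not part of the patch) regarding the STATIC_ARRAY_COUNT helper added to totem_common_unittest.h: as defined, it ends with a semicolon and does not parenthesize its expansion. That happens to compile in the simple constant initializers used here, but it becomes fragile as soon as the macro appears inside a larger expression. A more defensive form, shown purely as an assumption about how one might harden it, would be:

// Suggested hardening (not in the patch): drop the trailing semicolon and
// parenthesize the expansion so the macro composes inside larger expressions,
// e.g. const int total = STATIC_ARRAY_COUNT(funcs) * config_count;
#define STATIC_ARRAY_COUNT(array) (sizeof(array) / sizeof(*(array)))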