From 1243c34363b068155bdf06cfa397123b8ec33403 Mon Sep 17 00:00:00 2001
From: Dustin Richmond <dustinar@uw.edu>
Date: Thu, 31 Mar 2022 09:16:32 -0700
Subject: [PATCH 1/3] Added Preslav's working SW code

---
 examples/cuda/test_smith_waterman/Makefile    | 128 +++++
 examples/cuda/test_smith_waterman/kernel.cpp  | 293 ++++++++++
 .../kernel_smith_waterman.cpp                 | 263 +++++++++
 examples/cuda/test_smith_waterman/main.cpp    | 289 ++++++++++
 examples/cuda/test_smith_waterman/output      | 512 ++++++++++++++++++
 5 files changed, 1485 insertions(+)
 create mode 100644 examples/cuda/test_smith_waterman/Makefile
 create mode 100644 examples/cuda/test_smith_waterman/kernel.cpp
 create mode 100644 examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
 create mode 100644 examples/cuda/test_smith_waterman/main.cpp
 create mode 100644 examples/cuda/test_smith_waterman/output

diff --git a/examples/cuda/test_smith_waterman/Makefile b/examples/cuda/test_smith_waterman/Makefile
new file mode 100644
index 000000000..67ac012af
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/Makefile
@@ -0,0 +1,128 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = smith_waterman
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.cpp
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -D_DEFAULT_SOURCE
+CDEFINES +=
+CXXDEFINES +=
+# FLAGS: debug-info and warning options shared by the C and C++ flags below.
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS)
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS +=
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+
+BSG_MANYCORE_KERNELS ?= kernel.riscv
+RISCV_INCLUDES  +=
+RISCV_CCPPFLAGS += -D__KERNEL__
+# kernel.rvo is the only prerequisite declared for kernel.riscv in this file.
+kernel.riscv: kernel.rvo
+# Tile-group dimensions: overridable defaults, forwarded to device code as -D defines.
+TILE_GROUP_DIM_X ?= 16
+TILE_GROUP_DIM_Y ?= 8
+RISCV_DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y)
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+# Regression Flow
+###############################################################################
+
+regression: exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+.DEFAULT_GOAL := help
+
+# Neither target names a file it produces, so both are declared phony.
+.PHONY: clean regression
+clean:
+	rm -rf *.ld
+
diff --git a/examples/cuda/test_smith_waterman/kernel.cpp b/examples/cuda/test_smith_waterman/kernel.cpp
new file mode 100644
index 000000000..379c0c5de
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/kernel.cpp
@@ -0,0 +1,293 @@
+#define MEMCPY_FLAG
+#define HB
+//#define DEBUG
+
+#ifdef HB
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_tile_group_barrier.hpp"
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+#else
+#include "kernel_smith_waterman.hpp"
+#endif
+
+// Largest of two values; when they compare equal, b is returned.
+template <typename T>
+inline T max(T a, T b) {
+  return (a > b) ? a : b;
+}
+
+// Largest of four values, found by comparing the winners of (a, b) and (c, d).
+template <typename T>
+inline T max(T a, T b, T c, T d) {
+  const T first_pair = max(a, b);
+  const T second_pair = max(c, d);
+  return max(first_pair, second_pair);
+}
+
+// Largest of three values: the winner of (a, b) is compared against c.
+template <typename T>
+inline T max(T a, T b, T c) {
+  const T first_pair = max(a, b);
+  return max(first_pair, c);
+}
+
+// Expand packed 2-bit symbols into one byte each: every word of `packed` yields
+// 16 entries of `unpacked`, taken from the most-significant bit pair downwards.
+inline void unpack(const unsigned* packed, const int num_packed, unsigned char* unpacked) {
+  unsigned char* out = unpacked;
+  for (int i = 0; i < num_packed; i++, out += 16) {
+    // held as unsigned so the right shift never operates on a negative value
+    const unsigned word = packed[i];
+    for (int j = 0; j < 16; j++) {
+      out[j] = (unsigned char)((word >> (30 - 2 * j)) & 0x3u);
+    }
+  }
+}
+
+inline void profile_start(){  // calls bsg_cuda_print_stat_kernel_start() on HB builds; otherwise empty
+#if defined(HB)
+  bsg_cuda_print_stat_kernel_start();
+#endif  // HB
+}
+
+inline void profile_end(){  // calls bsg_cuda_print_stat_kernel_end() on HB builds; otherwise empty
+#if defined(HB)
+  bsg_cuda_print_stat_kernel_end();
+#endif  // HB
+}
+
+inline void sync(){  // calls sync() on the file-level barrier object on HB builds; otherwise empty
+#if defined(HB)
+  barrier.sync();
+#endif  // HB
+}
+
+// Per-row trace message; prints only when both HB and DEBUG are defined.
+inline void debug_printf(int tid, int k, int N, int i, int length){
+#if defined(HB) && defined(DEBUG)
+  bsg_printf("[Tile %d] Alignment %d/%d, Row %d/%d\n", tid, k, N, i, length);
+#endif  // HB && DEBUG
+  // with either macro missing the body is empty
+}
+
+// Copy num_words elements from src_ptr (DRAM) to dst_ptr. Whole groups of 16 use
+// non-blocking loads: 16 loads are issued, then 16 stores; leftovers go one by one.
+template <typename T>
+inline void hb_memcpy(const T* src_ptr, const int num_words, T* dst_ptr) {
+  const T* src = src_ptr;
+  T* dst = dst_ptr;
+
+  for (int i = 0; i < num_words / 16; i++) {
+    T tmp00 =  src[0];
+    T tmp01 =  src[1];
+    T tmp02 =  src[2];
+    T tmp03 =  src[3];
+    T tmp04 =  src[4];
+    T tmp05 =  src[5];
+    T tmp06 =  src[6];
+    T tmp07 =  src[7];
+    T tmp08 =  src[8];
+    T tmp09 =  src[9];
+    T tmp10 = src[10];
+    T tmp11 = src[11];
+    T tmp12 = src[12];
+    T tmp13 = src[13];
+    T tmp14 = src[14];
+    T tmp15 = src[15];
+    asm volatile("": : :"memory");  // compiler barrier: no store is hoisted above the loads
+    dst[0] = tmp00;
+    dst[1] = tmp01;
+    dst[2] = tmp02;
+    dst[3] = tmp03;
+    dst[4] = tmp04;
+    dst[5] = tmp05;
+    dst[6] = tmp06;
+    dst[7] = tmp07;
+    dst[8] = tmp08;
+    dst[9] = tmp09;
+    dst[10] = tmp10;
+    dst[11] = tmp11;
+    dst[12] = tmp12;
+    dst[13] = tmp13;
+    dst[14] = tmp14;
+    dst[15] = tmp15;
+    src += 16; dst += 16;
+  }
+  for (int i = 0; i < num_words % 16; i++) dst[i] = src[i];  // tail when num_words is not a multiple of 16
+}
+
+inline void load_spm(const unsigned* seqa, const unsigned* seqb,
+                     const unsigned* sizea, const unsigned* sizeb,
+                     const int num_packed_a, const int num_packed_b,
+                     const int N,
+                     unsigned* seqa_spm,  unsigned* seqb_spm,
+                     unsigned* sizea_spm,  unsigned* sizeb_spm) {
+  // Stage this tile's packed sequences (N * num_packed_* words) and their N
+  // lengths into the *_spm destination buffers.
+#ifdef MEMCPY_FLAG
+  hb_memcpy(seqa, N * num_packed_a, seqa_spm);
+  hb_memcpy(seqb, N * num_packed_b, seqb_spm);
+
+  // Transfer sequence lengths
+  if (N == 4) {  // unrolled path: all eight loads are issued before any store
+    unsigned sizea_temp0 = sizea[0];
+    unsigned sizea_temp1 = sizea[1];
+    unsigned sizea_temp2 = sizea[2];
+    unsigned sizea_temp3 = sizea[3];
+    unsigned sizeb_temp0 = sizeb[0];
+    unsigned sizeb_temp1 = sizeb[1];
+    unsigned sizeb_temp2 = sizeb[2];
+    unsigned sizeb_temp3 = sizeb[3];
+    asm volatile("": : :"memory");
+    sizea_spm[0] = sizea_temp0;
+    sizea_spm[1] = sizea_temp1;
+    sizea_spm[2] = sizea_temp2;
+    sizea_spm[3] = sizea_temp3;
+    sizeb_spm[0] = sizeb_temp0;
+    sizeb_spm[1] = sizeb_temp1;
+    sizeb_spm[2] = sizeb_temp2;
+    sizeb_spm[3] = sizeb_temp3;
+    return;
+  }
+#else
+  for (int i = 0; i < N * num_packed_a; i++) seqa_spm[i] = seqa[i];
+  for (int i = 0; i < N * num_packed_b; i++) seqb_spm[i] = seqb[i];
+#endif
+
+  // General case: copy exactly N lengths, whatever N is.
+  for (int k = 0; k < N; k++) {
+    sizea_spm[k] = sizea[k];
+    sizeb_spm[k] = sizeb[k];
+  }
+}
+
+inline int get_tid(){  // bsg_x * bsg_tiles_Y + bsg_y on HB builds, 0 otherwise
+#if !defined(HB)
+  return 0;
+#else
+  return bsg_x * bsg_tiles_Y + bsg_y;
+#endif
+}
+
+inline void align(const unsigned length, const unsigned width,  // row count (sequence A) and column count (sequence B)
+                  const unsigned char* seqa_spm_ptr,
+                  const unsigned char* seqb_spm_ptr,
+                  short* E_spm, short* F_spm, short* H_spm, short* H_prev_spm, int* score) {  // one-row buffers indexed by column; *score gets the result
+  // Hyperparameters (match GPGPU-Sim)
+  const int match_score    = 1;
+  const int mismatch_score = -3;
+  const int gap_open       = 3;
+  const int gap_extend     = 1;
+
+  // compute 2D DP matrix one row at a time, keeping the largest H value seen
+  int score_temp = 0;
+  const auto mm = [&](const unsigned char a, const unsigned char b){ return (a==b)?match_score:mismatch_score; };
+  for(int i = 0; i < 1; i++) {  // row 0 only: F is forced to 0 and no match/mismatch term is used
+    for(int j = 1; j < width; j++) {  // column 0 is never written by this function
+      E_spm[j] = max(E_spm[j-1] - gap_extend,
+                     H_spm[j-1] - gap_open
+                     );
+
+      F_spm[j] = 0;
+
+      H_prev_spm[j] = H_spm[j];
+      H_spm[j] = max((short)0, E_spm[j], F_spm[j]);
+      if (H_spm[j] > score_temp)
+        score_temp = H_spm[j];
+    }
+  }
+  for(int i = 1; i < length; i++) {
+    unsigned char seqa_val = seqa_spm_ptr[i];
+    for(int j = 1; j < width; j++) {
+      E_spm[j] = max(E_spm[j-1] - gap_extend,  // E/H at j-1 were already updated for this row
+                     H_spm[j-1] - gap_open
+                     );
+
+      F_spm[j] = max(F_spm[j] - gap_extend,  // F/H at j still hold the previous row's values
+                       H_spm[j] - gap_open
+                      );
+
+      H_prev_spm[j] = H_spm[j];  // save the previous row's H before overwriting it
+      H_spm[j] = max((short)0, E_spm[j], F_spm[j],
+                    (short)(H_prev_spm[j-1] + mm(seqa_val, seqb_spm_ptr[j])));  // H_prev_spm[j-1]: previous row, column j-1
+      if (H_spm[j] > score_temp)
+        score_temp = H_spm[j];
+      }
+  }
+  // DRAM write
+  *score = score_temp;
+}
+
+#ifdef HB
+extern "C" __attribute__ ((noinline))
+#endif
+void kernel_smith_waterman(  // device entry point: each tile scores its own N alignments
+  const int N,               // alignments handled per tile
+  const int SIZEA_MAX,       // maximum symbols per A sequence
+  const int SIZEB_MAX,       // maximum symbols per B sequence
+  const unsigned* seqa,      // packed A sequences, 16 symbols per word
+  const unsigned* seqb,      // packed B sequences, 16 symbols per word
+  const unsigned* sizea,     // length of each A sequence
+  const unsigned* sizeb,     // length of each B sequence
+  int* score                 // out: one score per alignment
+){
+  bsg_nonsynth_saif_start();
+  profile_start();
+  // determine which alignments the tile does
+  int tid = get_tid();
+  const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+  const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+  const unsigned* seqa_ptr = seqa + tid * N * SIZEA_MAX_PACKED;
+  const unsigned* seqb_ptr = seqb + tid * N * SIZEB_MAX_PACKED;
+  const unsigned* sizea_ptr = sizea + tid * N;
+  const unsigned* sizeb_ptr = sizeb + tid * N;
+  int* score_ptr = score + tid * N;
+
+  // Load data to SPM
+  unsigned seqa_packed_spm[N*SIZEA_MAX_PACKED];
+  unsigned seqb_packed_spm[N*SIZEB_MAX_PACKED];
+  unsigned length[N];
+  unsigned width[N];
+  load_spm(seqa_ptr, seqb_ptr, sizea_ptr, sizeb_ptr, SIZEA_MAX_PACKED, SIZEB_MAX_PACKED, N,
+           seqa_packed_spm, seqb_packed_spm, length, width);
+
+  // Initialize matrices
+  short E_spm[SIZEB_MAX];
+  short F_spm[SIZEB_MAX];
+  short H_spm[SIZEB_MAX];
+  short H_prev_spm[SIZEB_MAX];
+  for (int i = 0; i < SIZEB_MAX; i++) {
+    E_spm[i] = 0;
+    F_spm[i] = 0;
+    H_spm[i] = 0;
+    H_prev_spm[i] = 0;
+  }
+
+  // unpack buffers: unpack() writes 16 symbols per packed word, so size them in whole words
+  unsigned char seqa_spm[SIZEA_MAX_PACKED * 16];
+  unsigned char seqb_spm[SIZEB_MAX_PACKED * 16];
+  unsigned* seqa_packed_spm_ptr = seqa_packed_spm;
+  unsigned* seqb_packed_spm_ptr = seqb_packed_spm;
+
+  // loop through N alignments
+  for (int k = 0; k < N; k++) {
+    // unpack
+    unpack(seqa_packed_spm_ptr, SIZEA_MAX_PACKED, seqa_spm);
+    unpack(seqb_packed_spm_ptr, SIZEB_MAX_PACKED, seqb_spm);
+
+    // compute score
+    align(length[k], width[k], seqa_spm, seqb_spm,
+          E_spm, F_spm, H_spm, H_prev_spm, score_ptr);
+
+    // move to next sequence
+    seqa_packed_spm_ptr += SIZEA_MAX_PACKED;
+    seqb_packed_spm_ptr += SIZEB_MAX_PACKED;
+    score_ptr++;
+  }
+  profile_end();
+  sync();
+  bsg_nonsynth_saif_end();
+}
+
diff --git a/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp b/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
new file mode 100644
index 000000000..7f14a2571
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
@@ -0,0 +1,263 @@
+#define MEMCPY_FLAG
+#define HB
+//#define DEBUG
+
+#ifdef HB
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_tile_group_barrier.hpp"
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+#else
+#include "kernel_smith_waterman.hpp"
+#endif
+
+// Largest of two values; when they compare equal, b is returned.
+template <typename T>
+inline T max(T a, T b) {
+  return (a > b) ? a : b;
+}
+
+// Largest of four values, found by comparing the winners of (a, b) and (c, d).
+template <typename T>
+inline T max(T a, T b, T c, T d) {
+  const T first_pair = max(a, b);
+  const T second_pair = max(c, d);
+  return max(first_pair, second_pair);
+}
+
+// Largest of three values: the winner of (a, b) is compared against c.
+template <typename T>
+inline T max(T a, T b, T c) {
+  const T first_pair = max(a, b);
+  return max(first_pair, c);
+}
+
+// Expand packed 2-bit symbols into one byte each: every word of `packed` yields
+// 16 entries of `unpacked`, taken from the most-significant bit pair downwards.
+inline void unpack(const unsigned* packed, const int num_packed, unsigned char* unpacked) {
+  for (int i = 0; i < num_packed; i++) {
+    const unsigned word = packed[i];  // unsigned, so the shift never sees a negative value
+    for (int j = 0; j < 16; j++) {
+      unpacked[j] = (unsigned char)((word >> (30 - 2 * j)) & 0x3u);
+    }
+    unpacked += 16;
+  }
+}
+
+inline void profile_start(){  // calls bsg_cuda_print_stat_kernel_start() on HB builds; otherwise empty
+#if defined(HB)
+  bsg_cuda_print_stat_kernel_start();
+#endif  // HB
+}
+
+inline void profile_end(){  // calls bsg_cuda_print_stat_kernel_end() on HB builds; otherwise empty
+#if defined(HB)
+  bsg_cuda_print_stat_kernel_end();
+#endif  // HB
+}
+
+inline void sync(){  // calls sync() on the file-level barrier object on HB builds; otherwise empty
+#if defined(HB)
+  barrier.sync();
+#endif  // HB
+}
+
+// Per-row trace message; prints only when both HB and DEBUG are defined.
+inline void debug_printf(int tid, int k, int N, int i, int length){
+#if defined(HB) && defined(DEBUG)
+  bsg_printf("[Tile %d] Alignment %d/%d, Row %d/%d\n", tid, k, N, i, length);
+#endif  // HB && DEBUG
+  // with either macro missing the body is empty
+}
+
+inline void load_spm(const unsigned* seqa, const unsigned* seqb,
+                     const unsigned* sizea, const unsigned* sizeb,
+                     const int num_packed_a, const int num_packed_b,
+                     const int N,
+                     unsigned* seqa_spm,  unsigned* seqb_spm,
+                     unsigned* sizea_spm,  unsigned* sizeb_spm) {
+  // Stage this tile's packed sequences (N * num_packed_* words) and their N
+  // lengths into the *_spm destination buffers.
+#ifdef MEMCPY_FLAG
+  // Unrolled path, valid only for 4 alignments of 2 packed words per sequence.
+  if (N == 4 && num_packed_a == 2 && num_packed_b == 2) {
+    unsigned seqa_temp0 = seqa[0];
+    unsigned seqa_temp1 = seqa[1];
+    unsigned seqa_temp2 = seqa[2];
+    unsigned seqa_temp3 = seqa[3];
+    unsigned seqa_temp4 = seqa[4];
+    unsigned seqa_temp5 = seqa[5];
+    unsigned seqa_temp6 = seqa[6];
+    unsigned seqa_temp7 = seqa[7];
+    unsigned seqb_temp0 = seqb[0];
+    unsigned seqb_temp1 = seqb[1];
+    unsigned seqb_temp2 = seqb[2];
+    unsigned seqb_temp3 = seqb[3];
+    unsigned seqb_temp4 = seqb[4];
+    unsigned seqb_temp5 = seqb[5];
+    unsigned seqb_temp6 = seqb[6];
+    unsigned seqb_temp7 = seqb[7];
+    asm volatile("": : :"memory");
+    seqa_spm[0] = seqa_temp0;
+    seqa_spm[1] = seqa_temp1;
+    seqa_spm[2] = seqa_temp2;
+    seqa_spm[3] = seqa_temp3;
+    seqa_spm[4] = seqa_temp4;
+    seqa_spm[5] = seqa_temp5;
+    seqa_spm[6] = seqa_temp6;
+    seqa_spm[7] = seqa_temp7;
+    seqb_spm[0] = seqb_temp0;
+    seqb_spm[1] = seqb_temp1;
+    seqb_spm[2] = seqb_temp2;
+    seqb_spm[3] = seqb_temp3;
+    seqb_spm[4] = seqb_temp4;
+    seqb_spm[5] = seqb_temp5;
+    seqb_spm[6] = seqb_temp6;
+    seqb_spm[7] = seqb_temp7;
+
+    // Transfer sequence lengths
+    unsigned sizea_temp0 = sizea[0];
+    unsigned sizea_temp1 = sizea[1];
+    unsigned sizea_temp2 = sizea[2];
+    unsigned sizea_temp3 = sizea[3];
+    unsigned sizeb_temp0 = sizeb[0];
+    unsigned sizeb_temp1 = sizeb[1];
+    unsigned sizeb_temp2 = sizeb[2];
+    unsigned sizeb_temp3 = sizeb[3];
+    asm volatile("": : :"memory");
+    sizea_spm[0] = sizea_temp0;
+    sizea_spm[1] = sizea_temp1;
+    sizea_spm[2] = sizea_temp2;
+    sizea_spm[3] = sizea_temp3;
+    sizeb_spm[0] = sizeb_temp0;
+    sizeb_spm[1] = sizeb_temp1;
+    sizeb_spm[2] = sizeb_temp2;
+    sizeb_spm[3] = sizeb_temp3;
+    return;
+  }
+#endif
+  // General case: every packed word of all N sequences, then all N lengths.
+  for (int i = 0; i < N * num_packed_a; i++) seqa_spm[i] = seqa[i];
+  for (int i = 0; i < N * num_packed_b; i++) seqb_spm[i] = seqb[i];
+  for (int k = 0; k < N; k++) {
+    sizea_spm[k] = sizea[k];
+    sizeb_spm[k] = sizeb[k];
+  }
+}
+
+inline int get_tid(){  // bsg_x * bsg_tiles_Y + bsg_y on HB builds, 0 otherwise
+#if !defined(HB)
+  return 0;
+#else
+  return bsg_x * bsg_tiles_Y + bsg_y;
+#endif
+}
+
+#ifdef HB
+extern "C" __attribute__ ((noinline))
+#endif
+void kernel_smith_waterman(  // device entry point: each tile scores its own N alignments
+  const int N,               // alignments handled per tile
+  const int SIZEA_MAX,       // maximum symbols per A sequence
+  const int SIZEB_MAX,       // maximum symbols per B sequence
+  const unsigned* seqa,      // packed A sequences, 16 symbols per word
+  const unsigned* seqb,      // packed B sequences, 16 symbols per word
+  const unsigned* sizea,     // length of each A sequence
+  const unsigned* sizeb,     // length of each B sequence
+  unsigned* score            // out: one score per alignment
+){
+  profile_start();
+  // Hyperparameters (match GPGPU-Sim)
+  const int match_score    = 1;
+  const int mismatch_score = -3;
+  const int gap_open       = 3;
+  const int gap_extend     = 1;
+
+  // determine which alignments the tile does
+  int tid = get_tid();
+  const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+  const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+  const unsigned* seqa_ptr = seqa + tid * N * SIZEA_MAX_PACKED;
+  const unsigned* seqb_ptr = seqb + tid * N * SIZEB_MAX_PACKED;
+  const unsigned* sizea_ptr = sizea + tid * N;
+  const unsigned* sizeb_ptr = sizeb + tid * N;
+  unsigned* score_ptr = score + tid * N;
+
+  // Load data to SPM
+  unsigned seqa_packed_spm[N*SIZEA_MAX_PACKED];
+  unsigned seqb_packed_spm[N*SIZEB_MAX_PACKED];
+  unsigned length[N];
+  unsigned width[N];
+  load_spm(seqa_ptr, seqb_ptr, sizea_ptr, sizeb_ptr, SIZEA_MAX_PACKED, SIZEB_MAX_PACKED, N,
+           seqa_packed_spm, seqb_packed_spm, length, width);
+
+  // Unpack sequences in SPM; unpack() writes 16 symbols per packed word, so size in whole words
+  unsigned char seqa_spm[N*SIZEA_MAX_PACKED*16];
+  unsigned char seqb_spm[N*SIZEB_MAX_PACKED*16];
+  unpack(seqa_packed_spm, N * SIZEA_MAX_PACKED, seqa_spm);
+  unpack(seqb_packed_spm, N * SIZEB_MAX_PACKED, seqb_spm);
+
+  unsigned char* seqa_spm_ptr = seqa_spm;
+  unsigned char* seqb_spm_ptr = seqb_spm;
+
+  int E_spm[SIZEB_MAX];
+  int F_spm[SIZEB_MAX];
+  int H_spm[SIZEB_MAX];
+  int H_prev_spm[SIZEB_MAX];
+  unsigned score_temp;
+  for (int i = 0; i < SIZEB_MAX; i++) {
+    E_spm[i] = 0;
+    F_spm[i] = 0;
+    H_spm[i] = 0;
+    H_prev_spm[i] = 0;
+  }
+
+  // loop through N alignments
+  for (int k = 0; k < N; k++) {
+    const auto mm = [&](const unsigned char a, const unsigned char b){ return (a==b)?match_score:mismatch_score; };
+
+    // compute 2D DP matrix one row at a time, keeping the largest H value seen
+    score_temp = 0;
+    for(int i = 0; i < 1; i++) {  // row 0 only: F is forced to 0 and no match/mismatch term is used
+      debug_printf(tid, k, N, i, length[k]);
+      for(int j = 1; j < width[k]; j++) {
+        E_spm[j] = max(E_spm[j-1] - gap_extend,
+                       H_spm[j-1] - gap_open
+                       );
+
+        F_spm[j] = 0;
+
+        H_prev_spm[j] = H_spm[j];
+        H_spm[j] = max(0, E_spm[j], F_spm[j]);
+        if (H_spm[j] > score_temp)
+          score_temp = H_spm[j];
+      }
+    }
+    for(int i = 1; i < length[k]; i++) {
+      debug_printf(tid, k, N, i, length[k]);
+      unsigned char seqa_val = seqa_spm_ptr[i];
+      for(int j = 1; j < width[k]; j++) {
+        E_spm[j] = max(E_spm[j-1] - gap_extend,
+                       H_spm[j-1] - gap_open
+                       );
+
+        F_spm[j] = max(F_spm[j] - gap_extend,
+                       H_spm[j] - gap_open
+                      );
+
+        H_prev_spm[j] = H_spm[j];  // save the previous row's H before overwriting it
+        H_spm[j] = max(0, E_spm[j], F_spm[j],
+                      H_prev_spm[j-1] + mm(seqa_val, seqb_spm_ptr[j]));
+        if (H_spm[j] > score_temp)
+          score_temp = H_spm[j];
+      }
+    }
+    score_ptr[k] = score_temp;
+    seqa_spm_ptr += SIZEA_MAX_PACKED * 16;  // unpacked sequences are laid out in whole packed words
+    seqb_spm_ptr += SIZEB_MAX_PACKED * 16;
+  }
+  profile_end();
+  sync();
+}
+
diff --git a/examples/cuda/test_smith_waterman/main.cpp b/examples/cuda/test_smith_waterman/main.cpp
new file mode 100644
index 000000000..5d1853206
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/main.cpp
@@ -0,0 +1,289 @@
+// Copyright (c) 2019, University of Washington All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice, this
+// list of conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.
+//
+// Neither the name of the copyright holder nor the names of its contributors may
+// be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_tile.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <stdio.h>
+#include <bsg_manycore_regression.h>
+#include <math.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <map>
+#include <vector>
+#include <assert.h>
+
+using namespace std;
+
+#define PRINT_SCORE
+//#define PRINT_MATRIX
+#define ALLOC_NAME "default_allocator"
+#define CUDA_CALL(expr) /* run expr; log and return its code unless HB_MC_SUCCESS */ \
+        do {                                                            \
+                int __err;                                              \
+                __err = (expr);                                         \
+                if (__err != HB_MC_SUCCESS) {                           \
+                        bsg_pr_err("'%s' failed: %s\n", #expr, hb_mc_strerror(__err)); \
+                        return __err;                                   \
+                }                                                       \
+        } while (0)
+
+class Sequence {
+  private:
+  // Read DNA Sequence from file: N records, each two whitespace-separated
+  // tokens of which the first is discarded. Bases become 2-bit codes stored
+  // at seq[i*SIZE_SEQ + j]; size[i] is the number of bases stored for record i.
+  static void read_seq(const string file_name, const int N,
+                const int SIZE_SEQ, unsigned* seq, unsigned* size) {
+    // characters missing from this table are stored as 0
+    map<char, int> dna_char2int = {{'A', 0}, {'C', 1}, {'G', 2}, {'T', 3}, {'N', 0}};
+
+    ifstream fin(file_name);
+    if(fin.fail()){
+            bsg_pr_info("Hey! File does not exist!\n");
+            exit(1);
+    }
+    for (int i = 0; i < N; i++) {
+      string str, num;
+      fin >> num >> str;
+      // keep at most SIZE_SEQ bases so a long record cannot overrun its row
+      const int len = (str.size() < (size_t)SIZE_SEQ) ? (int)str.size() : SIZE_SEQ;
+      for (int j = 0; j < len; j++) {
+        seq[i*SIZE_SEQ+j] = dna_char2int[str[j]];
+      }
+      size[i] = len;
+    }
+    fin.close();
+  }
+
+  // Pack DNA sequence: 16 two-bit codes per word, first code in the top two
+  // bits. Words are OR-ed into, so the caller must pass `packed` zeroed; a
+  // final word with fewer than 16 codes keeps zeros in its low bits.
+  static void pack(const unsigned* unpacked, const int num_unpacked, const int num_packed, unsigned* packed) {
+    for (int i = 0; i < num_packed; i++) {
+      for (int j = 0; j < 16 && i * 16 + j < num_unpacked; j++) {
+        packed[i] |= (unpacked[i * 16 + j] & 0x3u) << (30 - 2 * j);
+      }
+    }
+  }
+
+  public:
+  // Get packed data: read N query/reference pairs and pack every sequence
+  // into its own SIZE*_MAX_PACKED words of seqa/seqb (both passed in zeroed).
+  static void get_data_packed(const int N, const int SIZEA_MAX, const int SIZEB_MAX,
+                              unsigned* seqa, unsigned* seqb,
+                              unsigned* sizea, unsigned* sizeb) {
+      const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+      const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+
+      // read N queries
+      vector<unsigned> seqa_unpacked(N*SIZEA_MAX);
+      read_seq("data/dna-query32.fasta", N, SIZEA_MAX, seqa_unpacked.data(), sizea);
+
+      // read N references
+      vector<unsigned> seqb_unpacked(N*SIZEB_MAX);
+      read_seq("data/dna-reference32.fasta", N, SIZEB_MAX, seqb_unpacked.data(), sizeb);
+
+      // pack one sequence at a time so each starts on a packed-word boundary,
+      // even when SIZE*_MAX is not a multiple of 16
+      for (int i = 0; i < N; i++) {
+        pack(seqa_unpacked.data() + i*SIZEA_MAX, SIZEA_MAX, SIZEA_MAX_PACKED, seqa + i*SIZEA_MAX_PACKED);
+        pack(seqb_unpacked.data() + i*SIZEB_MAX, SIZEB_MAX, SIZEB_MAX_PACKED, seqb + i*SIZEB_MAX_PACKED);
+      }
+  }
+  };
+
+int kernel_smith_waterman (int argc, char **argv) {  // host-side test driver
+        char *bin_path, *test_name;
+        struct arguments_path args = {NULL, NULL};
+        // For every pod: stage inputs, run the kernel, compare scores with data/output32.
+        argp_parse (&argp_path, argc, argv, 0, 0, &args);
+        bin_path = args.path;
+        test_name = args.name;
+
+        bsg_pr_test_info("Running the CUDA Smith-Waterman Kernel on one 16x8 tile group.\n");
+
+        srand(static_cast<unsigned>(time(0)));
+
+        /* Tile-group shape: 16x8, the same as the Makefile's TILE_GROUP_DIM_X/Y defaults. */
+        /* Initialize device, load binary and unfreeze tiles. */
+        hb_mc_dimension_t tg_dim = { .x = 16, .y = 8};
+        hb_mc_device_t device;
+        BSG_CUDA_CALL(hb_mc_device_init_custom_dimensions(&device, test_name, 0, tg_dim));
+
+        /* if DMA is not supported just return SUCCESS */
+        if (!hb_mc_manycore_supports_dma_write(device.mc)
+            || !hb_mc_manycore_supports_dma_read(device.mc)) {
+                bsg_pr_test_info("DMA not supported for this machine: returning success\n");
+                BSG_CUDA_CALL(hb_mc_device_finish(&device));
+                return HB_MC_SUCCESS;
+        }
+
+        hb_mc_pod_id_t pod;
+        hb_mc_device_foreach_pod_id(&device, pod)
+        {
+                BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod));
+                BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0));
+
+                // == Get data (N_TILE alignments for each tile of the group)
+                int num_tiles = tg_dim.x * tg_dim.y;
+                const int N_TILE = 4;
+                const int N = N_TILE * num_tiles;
+                const int SIZEA_MAX = 64;
+                const int SIZEB_MAX = 64;
+                const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+                const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+                vector<unsigned> seqa(N * SIZEA_MAX_PACKED);  // zero-filled, as pack() requires
+                vector<unsigned> seqb(N * SIZEB_MAX_PACKED);
+                vector<unsigned> sizea(N);
+                vector<unsigned> sizeb(N);
+                Sequence::get_data_packed(N, SIZEA_MAX, SIZEB_MAX, seqa.data(), seqb.data(), sizea.data(), sizeb.data());
+
+                // == Sending data to device
+                // Define the sizes of the I/O arrays
+                size_t seqa_bytes = N * SIZEA_MAX_PACKED * sizeof(unsigned);
+                size_t seqb_bytes = N * SIZEB_MAX_PACKED * sizeof(unsigned);
+                size_t sizea_bytes = N * sizeof(unsigned);
+                size_t sizeb_bytes = N * sizeof(unsigned);
+                size_t score_bytes = N * sizeof(unsigned);
+
+                // Allocate device memory for the I/O arrays
+                eva_t seqa_d, seqb_d, sizea_d, sizeb_d, score_d;
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, seqa_bytes, &seqa_d));
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, seqb_bytes, &seqb_d));
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizea_bytes, &sizea_d));
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizeb_bytes, &sizeb_d));
+                BSG_CUDA_CALL(hb_mc_device_malloc(&device, score_bytes, &score_d));
+
+                // Transfer data host -> device
+                hb_mc_dma_htod_t htod_jobs [] = {
+                        {
+                                .d_addr = seqa_d,
+                                .h_addr = seqa.data(),
+                                .size   = seqa_bytes
+                        },
+                        {
+                                .d_addr = seqb_d,
+                                .h_addr = seqb.data(),
+                                .size   = seqb_bytes
+                        },
+                        {
+                                .d_addr = sizea_d,
+                                .h_addr = sizea.data(),
+                                .size   = sizea_bytes
+                        },
+                        {
+                                .d_addr = sizeb_d,
+                                .h_addr = sizeb.data(),
+                                .size   = sizeb_bytes
+                        }
+                };
+
+                bsg_pr_test_info("Writing sequences and sizes to device\n");
+
+                BSG_CUDA_CALL(hb_mc_device_dma_to_device(&device, htod_jobs, 4));
+                // host staging buffers are vectors: released on scope exit, including early returns
+
+                // == Launching kernel ==
+                // Define amount of work for each tile group
+                /* tg_dim is the number of tiles in each tile group. */
+                /* A single tile group is launched, so the grid is 1x1. */
+                hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
+
+                /* Prepare list of input arguments for kernel. */
+                uint32_t cuda_argv[8] = {N_TILE, SIZEA_MAX, SIZEB_MAX,
+                                         seqa_d, seqb_d, sizea_d,
+                                         sizeb_d, score_d};
+
+                /* Enque grid of tile groups, pass in grid and tile group dimensions,
+                   kernel name, number and list of input arguments */
+                BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_smith_waterman", 8, cuda_argv));
+
+                /* Launch and execute all tile groups on device and wait for all to finish.  */
+                BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device));
+
+                // Transfer data device -> host
+                vector<int> score(N);
+                hb_mc_dma_dtoh_t dtoh_job = {
+                        .d_addr = score_d,
+                        .h_addr = score.data(),
+                        .size   = score_bytes
+                };
+
+                bsg_pr_test_info("Reading scores to host\n");
+
+                BSG_CUDA_CALL(hb_mc_device_dma_to_host(&device, &dtoh_job, 1));
+
+                /* Freeze the tiles and memory manager cleanup.  */
+                BSG_CUDA_CALL(hb_mc_device_program_finish(&device));
+
+                // == Check output
+                // check N scores against golden
+                vector<int> score_golden(N);
+                ifstream fin;
+                fin.open("data/output32", ios::in);
+                if (fin.fail()) {  // without the golden file there is nothing valid to compare against
+                  bsg_pr_err("Could not open golden scores file data/output32\n");
+                  return HB_MC_FAIL;
+                }
+                for (int i = 0; i < N; i++) {
+                  fin >> score_golden[i];
+                }
+                fin.close();
+
+                // Write to file
+                ofstream fout;
+                fout.open("output", ios::out);
+                for (int i = 0; i < N; i++) {
+                  fout << score[i] << endl;
+                }
+                fout.close();
+
+                // Check
+                for (int i = 0; i < N; i++) {
+                  if (score[i] != score_golden[i]) {
+                    cout << "ERROR : mismatch for score " << i << endl;
+                    return HB_MC_FAIL;
+                  }
+                }
+        }
+
+        BSG_CUDA_CALL(hb_mc_device_finish(&device));
+
+        return HB_MC_SUCCESS;
+}
+
+declare_program_main("test_smith_waterman", kernel_smith_waterman);
+
diff --git a/examples/cuda/test_smith_waterman/output b/examples/cuda/test_smith_waterman/output
new file mode 100644
index 000000000..94a52c5c4
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/output
@@ -0,0 +1,512 @@
+5
+5
+28
+5
+5
+7
+6
+3
+6
+4
+23
+4
+3
+5
+4
+4
+5
+4
+5
+5
+5
+5
+5
+3
+3
+5
+5
+6
+5
+4
+4
+5
+5
+5
+26
+4
+10
+4
+4
+3
+4
+4
+5
+4
+3
+4
+4
+4
+6
+5
+5
+4
+3
+7
+5
+7
+5
+3
+4
+12
+4
+3
+4
+4
+4
+4
+4
+6
+5
+4
+6
+5
+7
+4
+4
+7
+5
+4
+4
+5
+5
+5
+5
+5
+5
+4
+4
+3
+5
+5
+23
+4
+3
+6
+4
+6
+4
+4
+4
+7
+5
+4
+5
+3
+4
+5
+5
+5
+6
+5
+5
+4
+5
+6
+4
+5
+5
+3
+5
+6
+5
+4
+7
+24
+6
+8
+5
+5
+4
+4
+14
+9
+19
+5
+5
+7
+4
+4
+4
+4
+5
+5
+29
+5
+6
+4
+5
+4
+4
+5
+4
+4
+5
+4
+5
+4
+5
+5
+4
+3
+4
+4
+4
+4
+3
+5
+4
+5
+4
+5
+5
+5
+5
+11
+4
+20
+4
+24
+18
+15
+4
+4
+5
+6
+6
+6
+4
+6
+5
+4
+5
+3
+4
+4
+4
+5
+4
+4
+4
+4
+4
+5
+5
+11
+5
+5
+6
+5
+4
+5
+4
+21
+5
+4
+4
+4
+8
+4
+4
+4
+5
+4
+5
+4
+7
+4
+29
+7
+4
+6
+5
+5
+5
+6
+4
+5
+6
+4
+3
+6
+4
+6
+4
+5
+4
+4
+5
+7
+27
+8
+6
+4
+4
+4
+4
+4
+3
+5
+6
+5
+4
+5
+4
+3
+5
+5
+4
+5
+5
+5
+6
+6
+8
+6
+4
+4
+4
+4
+5
+4
+5
+5
+6
+4
+5
+6
+9
+8
+3
+4
+3
+7
+3
+5
+18
+6
+6
+6
+10
+6
+28
+6
+25
+4
+29
+3
+5
+6
+4
+6
+3
+4
+4
+5
+5
+5
+10
+4
+5
+5
+5
+4
+6
+4
+4
+29
+3
+4
+23
+5
+4
+4
+4
+4
+4
+5
+5
+4
+9
+4
+5
+4
+4
+8
+6
+5
+5
+4
+4
+4
+4
+4
+4
+12
+5
+5
+4
+4
+4
+4
+4
+5
+4
+7
+5
+10
+7
+5
+6
+7
+4
+4
+5
+4
+4
+5
+5
+6
+4
+4
+14
+6
+6
+5
+3
+4
+4
+4
+5
+5
+5
+7
+7
+5
+5
+5
+29
+5
+3
+5
+7
+8
+4
+4
+5
+3
+5
+4
+6
+4
+5
+5
+4
+5
+4
+5
+4
+4
+5
+5
+4
+5
+4
+4
+5
+8
+4
+4
+4
+30
+7
+5
+7
+7
+5
+7
+5
+7
+3
+5
+4
+6
+6
+5
+5
+6
+3
+6
+4
+4
+5
+4
+4
+3
+5
+4
+5
+5
+6
+5
+4
+5
+5
+4
+5
+5
+6
+5
+4
+3
+6
+5
+5
+4
+4
+5
+6
+4
+3
+5
+4
+8
+4
+5
+6
+6
+9
+3
+6
+5
+5
+4
+4
+4
+6
+6
+3
+23
+5
+4
+5
+5
+5
+6
+7
+4
+14
+4
+4
+4
+6
+5

From 250b65ecf8c800058d8cc2e4acf18b3dc1c15750 Mon Sep 17 00:00:00 2001
From: Dustin Richmond <dustinar@uw.edu>
Date: Thu, 31 Mar 2022 09:18:26 -0700
Subject: [PATCH 2/3] placeholder file

---
 examples/cuda/test_smith_waterman/data/.placeholder | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 examples/cuda/test_smith_waterman/data/.placeholder

diff --git a/examples/cuda/test_smith_waterman/data/.placeholder b/examples/cuda/test_smith_waterman/data/.placeholder
new file mode 100644
index 000000000..e69de29bb

From 5b4607e5514dc4bd46e2de5e500126f4244cad31 Mon Sep 17 00:00:00 2001
From: Dustin Richmond <dustinar@uw.edu>
Date: Thu, 31 Mar 2022 09:18:58 -0700
Subject: [PATCH 3/3] Move to apps directory

---
 .../{cuda/test_smith_waterman => apps/smith_waterman}/Makefile    | 0
 .../test_smith_waterman => apps/smith_waterman}/data/.placeholder | 0
 .../{cuda/test_smith_waterman => apps/smith_waterman}/kernel.cpp  | 0
 .../smith_waterman}/kernel_smith_waterman.cpp                     | 0
 .../{cuda/test_smith_waterman => apps/smith_waterman}/main.cpp    | 0
 examples/{cuda/test_smith_waterman => apps/smith_waterman}/output | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/Makefile (100%)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/data/.placeholder (100%)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/kernel.cpp (100%)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/kernel_smith_waterman.cpp (100%)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/main.cpp (100%)
 rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/output (100%)

diff --git a/examples/cuda/test_smith_waterman/Makefile b/examples/apps/smith_waterman/Makefile
similarity index 100%
rename from examples/cuda/test_smith_waterman/Makefile
rename to examples/apps/smith_waterman/Makefile
diff --git a/examples/cuda/test_smith_waterman/data/.placeholder b/examples/apps/smith_waterman/data/.placeholder
similarity index 100%
rename from examples/cuda/test_smith_waterman/data/.placeholder
rename to examples/apps/smith_waterman/data/.placeholder
diff --git a/examples/cuda/test_smith_waterman/kernel.cpp b/examples/apps/smith_waterman/kernel.cpp
similarity index 100%
rename from examples/cuda/test_smith_waterman/kernel.cpp
rename to examples/apps/smith_waterman/kernel.cpp
diff --git a/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp b/examples/apps/smith_waterman/kernel_smith_waterman.cpp
similarity index 100%
rename from examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
rename to examples/apps/smith_waterman/kernel_smith_waterman.cpp
diff --git a/examples/cuda/test_smith_waterman/main.cpp b/examples/apps/smith_waterman/main.cpp
similarity index 100%
rename from examples/cuda/test_smith_waterman/main.cpp
rename to examples/apps/smith_waterman/main.cpp
diff --git a/examples/cuda/test_smith_waterman/output b/examples/apps/smith_waterman/output
similarity index 100%
rename from examples/cuda/test_smith_waterman/output
rename to examples/apps/smith_waterman/output