From 1243c34363b068155bdf06cfa397123b8ec33403 Mon Sep 17 00:00:00 2001 From: Dustin Richmond Date: Thu, 31 Mar 2022 09:16:32 -0700 Subject: [PATCH 1/3] Added Preslav's working SW code --- examples/cuda/test_smith_waterman/Makefile | 128 +++++ examples/cuda/test_smith_waterman/kernel.cpp | 293 ++++++++++ .../kernel_smith_waterman.cpp | 263 +++++++++ examples/cuda/test_smith_waterman/main.cpp | 289 ++++++++++ examples/cuda/test_smith_waterman/output | 512 ++++++++++++++++++ 5 files changed, 1485 insertions(+) create mode 100644 examples/cuda/test_smith_waterman/Makefile create mode 100644 examples/cuda/test_smith_waterman/kernel.cpp create mode 100644 examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp create mode 100644 examples/cuda/test_smith_waterman/main.cpp create mode 100644 examples/cuda/test_smith_waterman/output diff --git a/examples/cuda/test_smith_waterman/Makefile b/examples/cuda/test_smith_waterman/Makefile new file mode 100644 index 000000000..67ac012af --- /dev/null +++ b/examples/cuda/test_smith_waterman/Makefile @@ -0,0 +1,128 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+CUDALITE_SRC_PATH = $(SPMD_SRC_PATH)/bsg_cuda_lite_runtime
+
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = smith_waterman
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.cpp
+
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -D_DEFAULT_SOURCE
+CDEFINES +=
+CXXDEFINES +=
+
+FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS)
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+LDFLAGS +=
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+
+BSG_MANYCORE_KERNELS ?= kernel.riscv
+RISCV_INCLUDES +=
+RISCV_CCPPFLAGS += -D__KERNEL__
+
+kernel.riscv: kernel.rvo
+
+TILE_GROUP_DIM_X ?= 16
+TILE_GROUP_DIM_Y ?= 8
+RISCV_DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y)
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+# For SPMD tests C arguments are:
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)
+
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+#
Regression Flow
+###############################################################################
+
+regression: exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+.DEFAULT_GOAL := help
+
+.PHONY: clean regression
+
+clean:
+	rm -rf *.ld
+
diff --git a/examples/cuda/test_smith_waterman/kernel.cpp b/examples/cuda/test_smith_waterman/kernel.cpp
new file mode 100644
index 000000000..379c0c5de
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/kernel.cpp
@@ -0,0 +1,330 @@
+#define MEMCPY_FLAG
+#define HB
+//#define DEBUG
+
+#ifdef HB
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_tile_group_barrier.hpp"
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+#else
+#include "kernel_smith_waterman.hpp"
+#endif
+
+// Returns the larger of a and b.
+template <typename T>
+inline T max(T a, T b)
+{
+  if (a > b)
+    return a;
+  else
+    return b;
+}
+
+// Returns the largest of four values.
+template <typename T>
+inline T max(T a, T b, T c, T d)
+{
+  return max(max(a, b), max(c, d));
+}
+
+// Returns the largest of three values.
+template <typename T>
+inline T max(T a, T b, T c)
+{
+  return max(max(a, b), c);
+}
+
+// Expands num_packed 32-bit words into 16 * num_packed one-byte symbols.
+// Each word holds sixteen 2-bit codes, highest bit pair first, so `unpacked`
+// must have room for 16 * num_packed bytes.
+inline void unpack(const unsigned* packed, const int num_packed, unsigned char* unpacked) {
+  unsigned char* unpacked_ptr = unpacked;
+  for (int i = 0; i < num_packed; i++) {
+    unsigned packed_val = packed[i];  // unsigned: the shift below is a logical shift
+    for (int j = 0; j < 16; j++) {
+      unsigned char unpacked_val = packed_val >> (30 - 2 * j);
+      unpacked_val &= 0x00000003;
+      unpacked_ptr[j] = unpacked_val;
+    }
+    unpacked_ptr += 16;
+  }
+}
+
+// Calls bsg_cuda_print_stat_kernel_start() on HB builds; no-op otherwise.
+inline void profile_start(){
+#ifdef HB
+  bsg_cuda_print_stat_kernel_start();
+#endif
+}
+
+// Calls bsg_cuda_print_stat_kernel_end() on HB builds; no-op otherwise.
+inline void profile_end(){
+#ifdef HB
+  bsg_cuda_print_stat_kernel_end();
+#endif
+}
+
+// Waits on the file-level tile-group barrier on HB builds; no-op otherwise.
+inline void sync(){
+#ifdef HB
+  barrier.sync();
+#endif
+}
+
+// Prints per-row progress; compiled in only when both HB and DEBUG are defined.
+inline void debug_printf(int tid, int k, int N, int i, int length){
+#ifdef HB
+#ifdef DEBUG
+  bsg_printf("[Tile %d] Alignment %d/%d, Row %d/%d\n", tid, k, N, i, length);
+#endif
+#endif
+}
+
+// Copies num_words elements from src_ptr to dst_ptr. Full groups of 16 are
+// read into temporaries before any store is issued (the compiler barrier keeps
+// the loads ahead of the stores); the original comment describes these as
+// non-blocking loads from DRAM. A remainder of num_words % 16 is copied one by one.
+template <typename T>
+inline void hb_memcpy(const T* src_ptr, const int num_words, T* dst_ptr) {
+  const T* src = src_ptr;
+  T* dst = dst_ptr;
+
+  for (int i = 0; i < num_words / 16; i++) {
+    T tmp00 = src[0];
+    T tmp01 = src[1];
+    T tmp02 = src[2];
+    T tmp03 = src[3];
+    T tmp04 = src[4];
+    T tmp05 = src[5];
+    T tmp06 = src[6];
+    T tmp07 = src[7];
+    T tmp08 = src[8];
+    T tmp09 = src[9];
+    T tmp10 = src[10];
+    T tmp11 = src[11];
+    T tmp12 = src[12];
+    T tmp13 = src[13];
+    T tmp14 = src[14];
+    T tmp15 = src[15];
+    asm volatile("": : :"memory");
+    dst[0] = tmp00;
+    dst[1] = tmp01;
+    dst[2] = tmp02;
+    dst[3] = tmp03;
+    dst[4] = tmp04;
+    dst[5] = tmp05;
+    dst[6] = tmp06;
+    dst[7] = tmp07;
+    dst[8] = tmp08;
+    dst[9] = tmp09;
+    dst[10] = tmp10;
+    dst[11] = tmp11;
+    dst[12] = tmp12;
+    dst[13] = tmp13;
+    dst[14] = tmp14;
+    dst[15] = tmp15;
+    src += 16;
+    dst += 16;
+  }
+  // Remainder: fewer than 16 elements left (previously dropped silently).
+  for (int i = 0; i < num_words % 16; i++) {
+    dst[i] = src[i];
+  }
+}
+
+// Copies this tile's packed sequences (N * num_packed_a / N * num_packed_b
+// words) and its N sequence lengths into the caller's *_spm buffers.
+inline void load_spm(const unsigned* seqa, const unsigned* seqb,
+                     const unsigned* sizea, const unsigned* sizeb,
+                     const int num_packed_a, const int num_packed_b,
+                     const int N,
+                     unsigned* seqa_spm, unsigned* seqb_spm,
+                     unsigned* sizea_spm, unsigned* sizeb_spm) {
+#ifdef MEMCPY_FLAG
+  // Transfer sequences
+  hb_memcpy(seqa, N * num_packed_a, seqa_spm);
+  hb_memcpy(seqb, N * num_packed_b, seqb_spm);
+
+  // Transfer sequence lengths four at a time so the loads are issued before
+  // the dependent stores; the scalar tail handles N that is not a multiple of
+  // four (the original copied exactly four entries regardless of N).
+  int k = 0;
+  for (; k + 4 <= N; k += 4) {
+    unsigned sizea_temp0 = sizea[k + 0];
+    unsigned sizea_temp1 = sizea[k + 1];
+    unsigned sizea_temp2 = sizea[k + 2];
+    unsigned sizea_temp3 = sizea[k + 3];
+    unsigned sizeb_temp0 = sizeb[k + 0];
+    unsigned sizeb_temp1 = sizeb[k + 1];
+    unsigned sizeb_temp2 = sizeb[k + 2];
+    unsigned sizeb_temp3 = sizeb[k + 3];
+    asm volatile("": : :"memory");
+    sizea_spm[k + 0] = sizea_temp0;
+    sizea_spm[k + 1] = sizea_temp1;
+    sizea_spm[k + 2] = sizea_temp2;
+    sizea_spm[k + 3] = sizea_temp3;
+    sizeb_spm[k + 0] = sizeb_temp0;
+    sizeb_spm[k + 1] = sizeb_temp1;
+    sizeb_spm[k + 2] = sizeb_temp2;
+    sizeb_spm[k + 3] = sizeb_temp3;
+  }
+  for (; k < N; k++) {
+    sizea_spm[k] = sizea[k];
+    sizeb_spm[k] = sizeb[k];
+  }
+#else
+  for (int i = 0; i < N * num_packed_a; i++) {
+    seqa_spm[i] = seqa[i];
+  }
+  for (int i = 0; i < N * num_packed_b; i++) {
+    seqb_spm[i] = seqb[i];
+  }
+
+  // load sizes of sequences to SPM
+  for (int k = 0; k < N; k++) {
+    sizea_spm[k] = sizea[k];
+    sizeb_spm[k] = sizeb[k];
+  }
+#endif
+}
+
+// Index of this tile: bsg_x * bsg_tiles_Y + bsg_y on HB builds, 0 otherwise.
+inline int get_tid(){
+#ifdef HB
+  return bsg_x * bsg_tiles_Y + bsg_y;
+#else
+  return 0;
+#endif
+}
+
+// Scores one alignment with an affine-gap Smith-Waterman style recurrence and
+// writes the best cell value to *score. E_spm/F_spm/H_spm/H_prev_spm are
+// caller-provided row buffers with at least `width` entries; index 0 of each is
+// never written here and acts as the boundary column. Row 0 is a boundary row
+// (no substitution term), so seqa_spm_ptr[0] and seqb_spm_ptr[0] are not compared.
+inline void align(const unsigned length, const unsigned width,
+                  const unsigned char* seqa_spm_ptr,
+                  const unsigned char* seqb_spm_ptr,
+                  short* E_spm, short* F_spm, short* H_spm, short* H_prev_spm, int* score) {
+  // Hyperparameters (match GPGPU-Sim)
+  const int match_score = 1;
+  const int mismatch_score = -3;
+  const int gap_open = 3;
+  const int gap_extend = 1;
+
+  // compute 2D DP matrix
+  int score_temp = 0;
+  const auto mm = [&](const unsigned char a, const unsigned char b){ return (a==b)?match_score:mismatch_score; };
+  for(unsigned i = 0; i < 1; i++) {
+    for(unsigned j = 1; j < width; j++) {
+      E_spm[j] = max(E_spm[j-1] - gap_extend,
+                     H_spm[j-1] - gap_open
+                    );
+
+      F_spm[j] = 0;
+
+      H_prev_spm[j] = H_spm[j];
+      H_spm[j] = max((short)0, E_spm[j], F_spm[j]);
+      if (H_spm[j] > score_temp)
+        score_temp = H_spm[j];
+    }
+  }
+  for(unsigned i = 1; i < length; i++) {
+    unsigned char seqa_val = seqa_spm_ptr[i];
+    for(unsigned j = 1; j < width; j++) {
+      E_spm[j] = max(E_spm[j-1] - gap_extend,
+                     H_spm[j-1] - gap_open
+                    );
+
+      F_spm[j] = max(F_spm[j] - gap_extend,
+                     H_spm[j] - gap_open
+                    );
+
+      H_prev_spm[j] = H_spm[j];  // H_prev_spm[j-1] now holds the previous row's H[j-1]
+      H_spm[j] = max((short)0, E_spm[j], F_spm[j],
+                     (short)(H_prev_spm[j-1] + mm(seqa_val, seqb_spm_ptr[j])));
+      if (H_spm[j] > score_temp)
+        score_temp = H_spm[j];
+    }
+  }
+  // DRAM write
+  *score = score_temp;
+}
+
+// Kernel entry point. Each tile scores N consecutive alignments: tile `tid`
+// reads entries [tid * N, (tid + 1) * N) of the size/score arrays and the
+// matching packed sequence words, and writes one int score per alignment.
+#ifdef HB
+extern "C" __attribute__ ((noinline))
+#endif
+void kernel_smith_waterman(
+  const int N,
+  const int SIZEA_MAX,
+  const int SIZEB_MAX,
+  const unsigned* seqa,
+  const unsigned* seqb,
+  const unsigned* sizea,
+  const unsigned* sizeb,
+  int* score
+){
+  bsg_nonsynth_saif_start();
+  profile_start();
+  // determine which alignments the tile does
+  int tid = get_tid();
+  const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+  const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+  const unsigned* seqa_ptr = seqa + tid * N * SIZEA_MAX_PACKED;
+  const unsigned* seqb_ptr = seqb + tid * N * SIZEB_MAX_PACKED;
+  const unsigned* sizea_ptr = sizea + tid * N;
+  const unsigned* sizeb_ptr = sizeb + tid * N;
+  int* score_ptr = score + tid * N;
+
+  // Load data to SPM
+  unsigned seqa_packed_spm[N*SIZEA_MAX_PACKED];
+  unsigned seqb_packed_spm[N*SIZEB_MAX_PACKED];
+  unsigned length[N];
+  unsigned width[N];
+  load_spm(seqa_ptr, seqb_ptr, sizea_ptr, sizeb_ptr, SIZEA_MAX_PACKED, SIZEB_MAX_PACKED, N,
+           seqa_packed_spm, seqb_packed_spm, length, width);
+
+  // Initialize matrices
+  short E_spm[SIZEB_MAX];
+  short F_spm[SIZEB_MAX];
+  short H_spm[SIZEB_MAX];
+  short H_prev_spm[SIZEB_MAX];
+  for (int i = 0; i < SIZEB_MAX; i++) {
+    E_spm[i] = 0;
+    F_spm[i] = 0;
+    H_spm[i] = 0;
+    H_prev_spm[i] = 0;
+  }
+
+  // Unpacked symbols. unpack() always writes 16 bytes per packed word, so the
+  // buffers are sized by the packed word count rather than by SIZEA_MAX/SIZEB_MAX.
+  unsigned char seqa_spm[SIZEA_MAX_PACKED * 16];
+  unsigned char seqb_spm[SIZEB_MAX_PACKED * 16];
+  unsigned* seqa_packed_spm_ptr = seqa_packed_spm;
+  unsigned* seqb_packed_spm_ptr = seqb_packed_spm;
+
+  // loop through N alignments
+  for (int k = 0; k < N; k++) {
+    // unpack
+    unpack(seqa_packed_spm_ptr, SIZEA_MAX_PACKED, seqa_spm);
+    unpack(seqb_packed_spm_ptr, SIZEB_MAX_PACKED, seqb_spm);
+
+    // compute score
+    align(length[k], width[k], seqa_spm, seqb_spm,
+          E_spm, F_spm, H_spm, H_prev_spm, score_ptr);
+
+    // move to next sequence
+    seqa_packed_spm_ptr += SIZEA_MAX_PACKED;
+    seqb_packed_spm_ptr += SIZEB_MAX_PACKED;
+    score_ptr++;
+  }
+  profile_end();
+  sync();
+  bsg_nonsynth_saif_end();
+}
+
diff --git a/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp b/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
new file mode 100644
index 000000000..7f14a2571
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp
@@ -0,0 +1,263 @@
+#define MEMCPY_FLAG
+#define HB
+//#define DEBUG
+
+#ifdef HB
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_tile_group_barrier.hpp"
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+#else
+#include "kernel_smith_waterman.hpp"
+#endif
+
+template <typename T>
+inline T max(T a, T b)
+{
+  if (a > b)
+    return a;
+  else
+    return b;
+}
+
+template <typename T>
+inline T max(T a, T b, T c, T d)
+{
+  return max(max(a, b), max(c, d));
+}
+
+template <typename T>
+inline T max(T a, T b, T c)
+{
+  return max(max(a, b), c);
+}
+
+inline void unpack(const unsigned* packed, const int num_packed, unsigned char* unpacked) {
+  for (int i = 0; i < num_packed; i++) {
+    unsigned packed_val = packed[i];  // unsigned: the shift below is a logical shift
+    for (int j = 0; j < 16; j++) {
+      unsigned char unpacked_val = packed_val >> (30 - 2 * j);
+      unpacked_val &= 0x00000003;
+      unpacked[j] = unpacked_val;
+    }
+    unpacked += 16;
+  }
+}
+
+inline void profile_start(){
+#ifdef HB
+  bsg_cuda_print_stat_kernel_start();
+#endif
+}
+
+inline void profile_end(){
+#ifdef HB
+  bsg_cuda_print_stat_kernel_end();
+#endif
+}
+
+inline void sync(){
+#ifdef HB
+  barrier.sync();
+#endif
+}
+
+inline void debug_printf(int tid, int k, int N, int i, int length){
+#ifdef HB
+#ifdef DEBUG
+  bsg_printf("[Tile %d] Alignment %d/%d, Row %d/%d\n", tid, k, N, i, length);
+#endif
+#endif
+}
+
+inline void load_spm(const unsigned* seqa, const unsigned* seqb,
+                     const unsigned* sizea, const unsigned* sizeb,
+                     const int num_packed_a, const int num_packed_b,
+                     const int N,
+                     unsigned* seqa_spm, unsigned* seqb_spm,
+                     unsigned* sizea_spm, unsigned* sizeb_spm) {
+#ifdef MEMCPY_FLAG
+  unsigned seqa_temp0 = seqa[0];  // NOTE(review): fixed 8 words per array, but the kernel unpacks N * num_packed words - only correct when that product is 8
+  unsigned seqa_temp1 = seqa[1];
+  unsigned seqa_temp2 = seqa[2];
+  unsigned seqa_temp3 = seqa[3];
+  unsigned seqa_temp4 = seqa[4];
+  unsigned seqa_temp5 = seqa[5];
+  unsigned seqa_temp6 = seqa[6];
+  unsigned seqa_temp7 = seqa[7];
+  unsigned seqb_temp0 = seqb[0];
+  unsigned seqb_temp1 = seqb[1];
+  unsigned seqb_temp2 = seqb[2];
+  unsigned seqb_temp3 = seqb[3];
+  unsigned seqb_temp4 = seqb[4];
+  unsigned seqb_temp5 = seqb[5];
+  unsigned seqb_temp6 = seqb[6];
+  unsigned seqb_temp7 = seqb[7];
+  asm volatile("": : :"memory");
+  seqa_spm[0] = seqa_temp0;
+  seqa_spm[1] = seqa_temp1;
+  seqa_spm[2] = seqa_temp2;
+  seqa_spm[3] = seqa_temp3;
+  seqa_spm[4] = seqa_temp4;
+  seqa_spm[5] = seqa_temp5;
+  seqa_spm[6] = seqa_temp6;
+  seqa_spm[7] = seqa_temp7;
+  seqb_spm[0] = seqb_temp0;
+  seqb_spm[1] = seqb_temp1;
+  seqb_spm[2] = seqb_temp2;
+  seqb_spm[3] = seqb_temp3;
+  seqb_spm[4] = seqb_temp4;
+  seqb_spm[5] = seqb_temp5;
+  seqb_spm[6] = seqb_temp6;
+  seqb_spm[7] = seqb_temp7;
+
+  // Transfer sequence lengths. NOTE(review): copies exactly 4 entries, i.e. assumes N == 4.
+  unsigned sizea_temp0 = sizea[0];
+  unsigned sizea_temp1 = sizea[1];
+  unsigned sizea_temp2 = sizea[2];
+  unsigned sizea_temp3 = sizea[3];
+  unsigned sizeb_temp0 = sizeb[0];
+  unsigned sizeb_temp1 = sizeb[1];
+  unsigned sizeb_temp2 = sizeb[2];
+  unsigned sizeb_temp3 = sizeb[3];
+  asm volatile("": : :"memory");
+  sizea_spm[0] = sizea_temp0;
+  sizea_spm[1] = sizea_temp1;
+  sizea_spm[2] = sizea_temp2;
+  sizea_spm[3] = sizea_temp3;
+  sizeb_spm[0] = sizeb_temp0;
+  sizeb_spm[1] = sizeb_temp1;
+  sizeb_spm[2] = sizeb_temp2;
+  sizeb_spm[3] = sizeb_temp3;
+#else
+  for (int i = 0; i < N * num_packed_a; i++) {  // all N sequences, as the kernel unpacks N * num_packed words
+    seqa_spm[i] = seqa[i];
+  }
+  for (int i = 0; i < N * num_packed_b; i++) {
+    seqb_spm[i] = seqb[i];
+  }
+
+  // load sizes of sequences to SPM
+  for (int k = 0; k < N; k++) {
+    sizea_spm[k] = sizea[k];
+    sizeb_spm[k] = sizeb[k];
+  }
+#endif
+}
+
+inline int get_tid(){
+#ifdef HB
+  return bsg_x * bsg_tiles_Y + bsg_y;
+#else
+  return 0;
+#endif
+}
+
+#ifdef HB
+extern "C" __attribute__ ((noinline))
+#endif
+void kernel_smith_waterman(
+  const int N,
+  const int SIZEA_MAX,
+  const int SIZEB_MAX,
+  const unsigned* seqa,
+  const unsigned* seqb,
+  const unsigned* sizea,
+  const unsigned* sizeb,
+  unsigned* score
+){
+  profile_start();
+  // Hyperparameters (match GPGPU-Sim)
+  const int match_score = 1;
+  const int mismatch_score = -3;
+  const int gap_open = 3;
+  const int gap_extend = 1;
+
+  // determine which alignments the tile does
+  int tid = get_tid();
+  const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+  const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+  const unsigned* seqa_ptr = seqa + tid * N * SIZEA_MAX_PACKED;
+  const unsigned* seqb_ptr = seqb + tid * N * SIZEB_MAX_PACKED;
+  const unsigned* sizea_ptr = sizea + tid * N;
+  const unsigned* sizeb_ptr = sizeb + tid * N;
+  unsigned* score_ptr = score + tid * N;
+
+  // Load data to SPM
+  unsigned seqa_packed_spm[N*SIZEA_MAX_PACKED];
+  unsigned seqb_packed_spm[N*SIZEB_MAX_PACKED];
+  unsigned length[N];
+  unsigned width[N];
+  load_spm(seqa_ptr, seqb_ptr, sizea_ptr, sizeb_ptr, SIZEA_MAX_PACKED, SIZEB_MAX_PACKED, N,
+           seqa_packed_spm, seqb_packed_spm, length, width);
+
+  // Unpack sequences in SPM (unpack() writes 16 bytes per packed word, so size by word count)
+  unsigned char seqa_spm[N*SIZEA_MAX_PACKED*16];
+  unsigned char seqb_spm[N*SIZEB_MAX_PACKED*16];
+  unpack(seqa_packed_spm, N * SIZEA_MAX_PACKED, seqa_spm);
+  unpack(seqb_packed_spm, N * SIZEB_MAX_PACKED, seqb_spm);
+
+  unsigned char* seqa_spm_ptr = seqa_spm;
+  unsigned char* seqb_spm_ptr = seqb_spm;
+
+  int E_spm[SIZEB_MAX];
+  int F_spm[SIZEB_MAX];
+  int H_spm[SIZEB_MAX];
+  int H_prev_spm[SIZEB_MAX];
+  unsigned score_temp;
+  for (int i = 0; i < SIZEB_MAX; i++) {
+    E_spm[i] = 0;
+    F_spm[i] = 0;
+    H_spm[i] = 0;
+    H_prev_spm[i] = 0;
+  }
+
+  // loop through N alignments
+  for (int k = 0; k < N; k++) {
+    const auto mm = [&](const unsigned char a, const unsigned char b){ return (a==b)?match_score:mismatch_score; };
+
+    // compute 2D DP matrix
+    score_temp = 0;
+    for(int i = 0; i < 1; i++) {
+      debug_printf(tid, k, N, i, length[k]);
+      for(int j = 1; j < width[k]; j++) {
+        E_spm[j] = max(E_spm[j-1] - gap_extend,
+                       H_spm[j-1] - gap_open
+                      );
+
+        F_spm[j] = 0;
+
+        H_prev_spm[j] = H_spm[j];
+        H_spm[j] = max(0, E_spm[j], F_spm[j]);
+        if (H_spm[j] > score_temp)
+          score_temp = H_spm[j];
+      }
+    }
+    for(int i = 1; i < length[k]; i++) {
+      debug_printf(tid, k, N, i, length[k]);
+      unsigned char seqa_val = seqa_spm_ptr[i];
+      for(int j = 1; j < width[k]; j++) {
+        E_spm[j] = max(E_spm[j-1] - gap_extend,
+                       H_spm[j-1] - gap_open
+                      );
+
+        F_spm[j] = max(F_spm[j] - gap_extend,
+                       H_spm[j] - gap_open
+                      );
+
+        H_prev_spm[j] = H_spm[j];
+        H_spm[j] = max(0, E_spm[j], F_spm[j],
+                       H_prev_spm[j-1] + mm(seqa_val, seqb_spm_ptr[j]));
+        if (H_spm[j] > score_temp)
+          score_temp = H_spm[j];
+      }
+    }
+    score_ptr[k] = score_temp;
+    seqa_spm_ptr += SIZEA_MAX;
+    seqb_spm_ptr += SIZEB_MAX;
+  }
+  profile_end();
+  sync();
+}
+
diff --git a/examples/cuda/test_smith_waterman/main.cpp b/examples/cuda/test_smith_waterman/main.cpp
new file mode 100644
index 000000000..5d1853206
--- /dev/null
+++ b/examples/cuda/test_smith_waterman/main.cpp
@@ -0,0 +1,289 @@
+// Copyright (c) 2019, University of Washington All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above copyright notice, this
+// list of conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.
+//
+// Neither the name of the copyright holder nor the names of its contributors may
+// be used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <bsg_manycore_tile.h>  // NOTE(review): the 19 header names were lost in extraction; this list is reconstructed from the names used below - confirm against the original
+#include <bsg_manycore_errno.h>
+#include <bsg_manycore_loader.h>
+#include <bsg_manycore_cuda.h>
+#include <bsg_manycore_regression.h>
+#include <bsg_manycore.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+
+using namespace std;
+
+#define PRINT_SCORE
+//#define PRINT_MATRIX
+#define ALLOC_NAME "default_allocator"
+#define CUDA_CALL(expr) \
+  { \
+    int __err; \
+    __err = expr; \
+    if (__err != HB_MC_SUCCESS) { \
+      bsg_pr_err("'%s' failed: %s\n", #expr, hb_mc_strerror(__err)); \
+      return __err; \
+    } \
+  }
+
+class Sequence {
+  private:
+    // Reads N (label, sequence) token pairs; sequence i is stored as 2-bit codes at seq[i*SIZE_SEQ..], clamped to SIZE_SEQ symbols
+    static void read_seq(const string file_name, const int N,
+                         const int SIZE_SEQ, unsigned* seq, unsigned* size) {
+      map<char, int> dna_char2int = {{'A', 0}, {'C', 1}, {'G', 2}, {'T', 3}, {'N', 0}};
+
+      ifstream fin;
+      fin.open(file_name, ios::in);
+      if(fin.fail()){
+        bsg_pr_err("Cannot open sequence file '%s'\n", file_name.c_str());
+        exit(1);
+      }
+      for (int i = 0; i < N; i++) {
+        string str, num;
+        fin >> num >> str;
+        for (size_t j = 0; j < str.size() && j < (size_t)SIZE_SEQ; j++) {  // never spill into sequence i+1's slot
+          seq[i*SIZE_SEQ+j] = dna_char2int[str[j]];
+        }
+        size[i] = min(str.size(), (size_t)SIZE_SEQ);  // the kernel sizes its buffers from the same maximum
+      }
+      fin.close();
+    }
+
+    // Packs sixteen 2-bit codes per word, first code in the top bit pair; `packed` must be zero-initialised (values are OR-ed in)
+    static void pack(const unsigned* unpacked, const int num_unpacked, const int num_packed, unsigned* packed) {
+      for (int i = 0; i < num_packed; i++) {
+        for (int j = 0; j < 16 && i * 16 + j < num_unpacked; j++) {
+          unsigned unpacked_val = unpacked[j] << (30 - 2 * j);
+          packed[i] |= unpacked_val;
+        }
+        unpacked += 16;
+      }
+    }
+
+  public:
+    // Reads N queries and N references and packs them into seqa/seqb; sizea/sizeb receive the per-sequence lengths
+    static void get_data_packed(const int N, const int SIZEA_MAX, const int SIZEB_MAX,
+                                unsigned* seqa, unsigned* seqb,
+                                unsigned* sizea, unsigned* sizeb) {
+      // read N queries
+      unsigned* seqa_unpacked = new unsigned[N*SIZEA_MAX]();
+      read_seq("data/dna-query32.fasta", N, SIZEA_MAX, seqa_unpacked, sizea);
+
+      // read N references
+      unsigned* seqb_unpacked = new unsigned[N*SIZEB_MAX]();
+      read_seq("data/dna-reference32.fasta", N, SIZEB_MAX, seqb_unpacked, sizeb);
+
+      // pack (NOTE(review): packs all N sequences as one stream, so per-sequence word offsets only line up when SIZEA_MAX/SIZEB_MAX are multiples of 16)
+      int num_unpacked = N * SIZEA_MAX;
+      const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16;
+      int num_packed = N * SIZEA_MAX_PACKED;
+      unsigned* unpacked = seqa_unpacked;
+      pack(unpacked, num_unpacked, num_packed, seqa);
+
+      const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16;
+      num_unpacked = N * SIZEB_MAX;
+      num_packed = N * SIZEB_MAX_PACKED;
+      unpacked = seqb_unpacked;
+      pack(unpacked, num_unpacked, num_packed, seqb);
+      delete[] seqa_unpacked;
+      delete[] seqb_unpacked;
+    }
+ };
+
+int kernel_smith_waterman (int argc, char **argv) {
+  char *bin_path, *test_name;
+  struct arguments_path args = {NULL, NULL};
+
+  argp_parse (&argp_path, argc, argv, 0, 0, &args);
+  bin_path = args.path;
+  test_name = args.name;
+
+  bsg_pr_test_info("Running the CUDA Smith-Waterman Kernel on one 16x8 tile group.\n");
+
+  srand(static_cast<unsigned>(time(0)));
+
+  /* Define path to binary.
*/ + /* Initialize device, load binary and unfreeze tiles. */ + hb_mc_dimension_t tg_dim = { .x = 16, .y = 8}; + hb_mc_device_t device; + BSG_CUDA_CALL(hb_mc_device_init_custom_dimensions(&device, test_name, 0, tg_dim)); + + /* if DMA is not supported just return SUCCESS */ + if (!hb_mc_manycore_supports_dma_write(device.mc) + || !hb_mc_manycore_supports_dma_read(device.mc)) { + bsg_pr_test_info("DMA not supported for this machine: returning success\n"); + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + return HB_MC_SUCCESS; + } + + hb_mc_pod_id_t pod; + hb_mc_device_foreach_pod_id(&device, pod) + { + BSG_CUDA_CALL(hb_mc_device_set_default_pod(&device, pod)); + BSG_CUDA_CALL(hb_mc_device_program_init(&device, bin_path, ALLOC_NAME, 0)); + + // == Get data + int num_tiles = tg_dim.x * tg_dim.y; + const int N_TILE = 4; + const int N = N_TILE * num_tiles; + const int SIZEA_MAX = 64; + const int SIZEB_MAX = 64; + const int SIZEA_MAX_PACKED = (SIZEA_MAX + 15) / 16; + const int SIZEB_MAX_PACKED = (SIZEB_MAX + 15) / 16; + unsigned* seqa = new unsigned[N * SIZEA_MAX_PACKED](); + unsigned* seqb = new unsigned[N * SIZEB_MAX_PACKED](); + unsigned* sizea = new unsigned[N]; + unsigned* sizeb = new unsigned[N]; + Sequence::get_data_packed(N, SIZEA_MAX, SIZEB_MAX, seqa, seqb, sizea, sizeb); + + // == Sending data to device + // Define the sizes of the I/O arrays + size_t seqa_bytes = N * SIZEA_MAX_PACKED * sizeof(unsigned); + size_t seqb_bytes = N * SIZEB_MAX_PACKED * sizeof(unsigned); + size_t sizea_bytes = N * sizeof(unsigned); + size_t sizeb_bytes = N * sizeof(unsigned); + size_t score_bytes = N * sizeof(unsigned); + + // Allocate device memory for the I/O arrays + eva_t seqa_d, seqb_d, sizea_d, sizeb_d, score_d; + BSG_CUDA_CALL(hb_mc_device_malloc(&device, seqa_bytes, &seqa_d)); + BSG_CUDA_CALL(hb_mc_device_malloc(&device, seqb_bytes, &seqb_d)); + BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizea_bytes, &sizea_d)); + BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizeb_bytes, 
&sizeb_d)); + BSG_CUDA_CALL(hb_mc_device_malloc(&device, score_bytes, &score_d)); + + // Transfer data host -> device + hb_mc_dma_htod_t htod_jobs [] = { + { + .d_addr = seqa_d, + .h_addr = seqa, + .size = seqa_bytes + }, + { + .d_addr = seqb_d, + .h_addr = seqb, + .size = seqb_bytes + }, + { + .d_addr = sizea_d, + .h_addr = sizea, + .size = sizea_bytes + }, + { + .d_addr = sizeb_d, + .h_addr = sizeb, + .size = sizeb_bytes + } + }; + + bsg_pr_test_info("Writing A and B to device\n"); + + BSG_CUDA_CALL(hb_mc_device_dma_to_device(&device, htod_jobs, 4)); + delete[] seqa; + delete[] seqb; + delete[] sizea; + delete[] sizeb; + + // == Launching kernel == + // Define amount of work for each tile group + /* Define tg_dim_x/y: number of tiles in each tile group */ + /* Calculate grid_dim_x/y: number of tile groups needed based on block_size_x/y */ + hb_mc_dimension_t grid_dim = { .x = 1, .y = 1}; + + /* Prepare list of input arguments for kernel. */ + uint32_t cuda_argv[8] = {N_TILE, SIZEA_MAX, SIZEB_MAX, + seqa_d, seqb_d, sizea_d, + sizeb_d, score_d}; + + /* Enque grid of tile groups, pass in grid and tile group dimensions, + kernel name, number and list of input arguments */ + BSG_CUDA_CALL(hb_mc_kernel_enqueue (&device, grid_dim, tg_dim, "kernel_smith_waterman", 8, cuda_argv)); + + /* Launch and execute all tile groups on device and wait for all to finish. */ + BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(&device)); + + // Transfer data device -> host + int* score = new int[N]; + hb_mc_dma_dtoh_t dtoh_job = { + .d_addr = score_d, + .h_addr = score, + .size = score_bytes + }; + + bsg_pr_test_info("Reading C to host\n"); + + BSG_CUDA_CALL(hb_mc_device_dma_to_host(&device, &dtoh_job, 1)); + + /* Freeze the tiles and memory manager cleanup. 
*/ + BSG_CUDA_CALL(hb_mc_device_program_finish(&device)); + + // == Check output + // check N scores against golden + unsigned score_golden[N]; + ifstream fin; + fin.open("data/output32", ios::in); + for (int i = 0; i < N; i++) { + fin >> score_golden[i]; + } + fin.close(); + + // Write to file + ofstream fout; + fout.open("output", ios::out); + for (int i = 0; i < N; i++) { + fout << score[i] << endl; + } + fout.close(); + + // Check + for (int i = 0; i < N; i++) { + if (score[i] != score_golden[i]) { + cout << "ERROR : mismatch for score " << i << endl; + return HB_MC_FAIL; + } + } + delete[] score; + } + + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + + return HB_MC_SUCCESS; +} + +declare_program_main("test_smith_waterman", kernel_smith_waterman); + diff --git a/examples/cuda/test_smith_waterman/output b/examples/cuda/test_smith_waterman/output new file mode 100644 index 000000000..94a52c5c4 --- /dev/null +++ b/examples/cuda/test_smith_waterman/output @@ -0,0 +1,512 @@ +5 +5 +28 +5 +5 +7 +6 +3 +6 +4 +23 +4 +3 +5 +4 +4 +5 +4 +5 +5 +5 +5 +5 +3 +3 +5 +5 +6 +5 +4 +4 +5 +5 +5 +26 +4 +10 +4 +4 +3 +4 +4 +5 +4 +3 +4 +4 +4 +6 +5 +5 +4 +3 +7 +5 +7 +5 +3 +4 +12 +4 +3 +4 +4 +4 +4 +4 +6 +5 +4 +6 +5 +7 +4 +4 +7 +5 +4 +4 +5 +5 +5 +5 +5 +5 +4 +4 +3 +5 +5 +23 +4 +3 +6 +4 +6 +4 +4 +4 +7 +5 +4 +5 +3 +4 +5 +5 +5 +6 +5 +5 +4 +5 +6 +4 +5 +5 +3 +5 +6 +5 +4 +7 +24 +6 +8 +5 +5 +4 +4 +14 +9 +19 +5 +5 +7 +4 +4 +4 +4 +5 +5 +29 +5 +6 +4 +5 +4 +4 +5 +4 +4 +5 +4 +5 +4 +5 +5 +4 +3 +4 +4 +4 +4 +3 +5 +4 +5 +4 +5 +5 +5 +5 +11 +4 +20 +4 +24 +18 +15 +4 +4 +5 +6 +6 +6 +4 +6 +5 +4 +5 +3 +4 +4 +4 +5 +4 +4 +4 +4 +4 +5 +5 +11 +5 +5 +6 +5 +4 +5 +4 +21 +5 +4 +4 +4 +8 +4 +4 +4 +5 +4 +5 +4 +7 +4 +29 +7 +4 +6 +5 +5 +5 +6 +4 +5 +6 +4 +3 +6 +4 +6 +4 +5 +4 +4 +5 +7 +27 +8 +6 +4 +4 +4 +4 +4 +3 +5 +6 +5 +4 +5 +4 +3 +5 +5 +4 +5 +5 +5 +6 +6 +8 +6 +4 +4 +4 +4 +5 +4 +5 +5 +6 +4 +5 +6 +9 +8 +3 +4 +3 +7 +3 +5 +18 +6 +6 +6 +10 +6 +28 +6 +25 +4 +29 +3 +5 +6 +4 +6 +3 +4 +4 +5 +5 +5 +10 +4 +5 +5 +5 +4 +6 +4 +4 +29 +3 +4 
+23 +5 +4 +4 +4 +4 +4 +5 +5 +4 +9 +4 +5 +4 +4 +8 +6 +5 +5 +4 +4 +4 +4 +4 +4 +12 +5 +5 +4 +4 +4 +4 +4 +5 +4 +7 +5 +10 +7 +5 +6 +7 +4 +4 +5 +4 +4 +5 +5 +6 +4 +4 +14 +6 +6 +5 +3 +4 +4 +4 +5 +5 +5 +7 +7 +5 +5 +5 +29 +5 +3 +5 +7 +8 +4 +4 +5 +3 +5 +4 +6 +4 +5 +5 +4 +5 +4 +5 +4 +4 +5 +5 +4 +5 +4 +4 +5 +8 +4 +4 +4 +30 +7 +5 +7 +7 +5 +7 +5 +7 +3 +5 +4 +6 +6 +5 +5 +6 +3 +6 +4 +4 +5 +4 +4 +3 +5 +4 +5 +5 +6 +5 +4 +5 +5 +4 +5 +5 +6 +5 +4 +3 +6 +5 +5 +4 +4 +5 +6 +4 +3 +5 +4 +8 +4 +5 +6 +6 +9 +3 +6 +5 +5 +4 +4 +4 +6 +6 +3 +23 +5 +4 +5 +5 +5 +6 +7 +4 +14 +4 +4 +4 +6 +5 From 250b65ecf8c800058d8cc2e4acf18b3dc1c15750 Mon Sep 17 00:00:00 2001 From: Dustin Richmond Date: Thu, 31 Mar 2022 09:18:26 -0700 Subject: [PATCH 2/3] placeholder file --- examples/cuda/test_smith_waterman/data/.placeholder | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/cuda/test_smith_waterman/data/.placeholder diff --git a/examples/cuda/test_smith_waterman/data/.placeholder b/examples/cuda/test_smith_waterman/data/.placeholder new file mode 100644 index 000000000..e69de29bb From 5b4607e5514dc4bd46e2de5e500126f4244cad31 Mon Sep 17 00:00:00 2001 From: Dustin Richmond Date: Thu, 31 Mar 2022 09:18:58 -0700 Subject: [PATCH 3/3] Move to apps directory --- .../{cuda/test_smith_waterman => apps/smith_waterman}/Makefile | 0 .../test_smith_waterman => apps/smith_waterman}/data/.placeholder | 0 .../{cuda/test_smith_waterman => apps/smith_waterman}/kernel.cpp | 0 .../smith_waterman}/kernel_smith_waterman.cpp | 0 .../{cuda/test_smith_waterman => apps/smith_waterman}/main.cpp | 0 examples/{cuda/test_smith_waterman => apps/smith_waterman}/output | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/Makefile (100%) rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/data/.placeholder (100%) rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/kernel.cpp (100%) rename examples/{cuda/test_smith_waterman => 
apps/smith_waterman}/kernel_smith_waterman.cpp (100%) rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/main.cpp (100%) rename examples/{cuda/test_smith_waterman => apps/smith_waterman}/output (100%) diff --git a/examples/cuda/test_smith_waterman/Makefile b/examples/apps/smith_waterman/Makefile similarity index 100% rename from examples/cuda/test_smith_waterman/Makefile rename to examples/apps/smith_waterman/Makefile diff --git a/examples/cuda/test_smith_waterman/data/.placeholder b/examples/apps/smith_waterman/data/.placeholder similarity index 100% rename from examples/cuda/test_smith_waterman/data/.placeholder rename to examples/apps/smith_waterman/data/.placeholder diff --git a/examples/cuda/test_smith_waterman/kernel.cpp b/examples/apps/smith_waterman/kernel.cpp similarity index 100% rename from examples/cuda/test_smith_waterman/kernel.cpp rename to examples/apps/smith_waterman/kernel.cpp diff --git a/examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp b/examples/apps/smith_waterman/kernel_smith_waterman.cpp similarity index 100% rename from examples/cuda/test_smith_waterman/kernel_smith_waterman.cpp rename to examples/apps/smith_waterman/kernel_smith_waterman.cpp diff --git a/examples/cuda/test_smith_waterman/main.cpp b/examples/apps/smith_waterman/main.cpp similarity index 100% rename from examples/cuda/test_smith_waterman/main.cpp rename to examples/apps/smith_waterman/main.cpp diff --git a/examples/cuda/test_smith_waterman/output b/examples/apps/smith_waterman/output similarity index 100% rename from examples/cuda/test_smith_waterman/output rename to examples/apps/smith_waterman/output