From cbd231e0ea66f5e467c5e40f1c56046269244b81 Mon Sep 17 00:00:00 2001 From: nournadar Date: Sun, 27 Nov 2022 13:39:11 +0300 Subject: [PATCH] v-0.0.2 --- .gitignore | 1 + .gitmodules | 4 - CMakeLists.txt | 18 +- LICENSE | 24 + README.md | 5 +- benchmark/cpp/bench_mvm_fp32.cpp | 3 + benchmark/cpp/bench_mvm_singlecomplex.cpp | 3 + benchmark/cuda/bench_mvm_cuda_fp32.cpp | 3 + .../cuda/bench_mvm_cuda_singlecomplex.cpp | 3 + benchmark/hip/Bench_hip_constrank.cpp | 3 + benchmark/hip/Bench_hip_tlrmvm.sh | 5 + cmake/hiptlrmvm.cmake | 21 + install/install-x86-cuda.sh | 5 + install/install-x86.sh | 5 + install/spack-install-x86-cuda.sh | 5 + install/spack-install-x86.sh | 5 + pytlrmvm/Densemat.py | 3 + pytlrmvm/__init__.py | 3 + pytlrmvm/_wrapper.py | 3 + pytlrmvm/convertmat2npy.py | 27 - pytlrmvm/generateseismicinput.py | 77 -- pytlrmvm/src/BatchTlrmvm.cpp | 3 + pytlrmvm/src/CommonWrapper.cpp | 3 + pytlrmvm/src/Tlrmvm.cpp | 3 + pytlrmvm/src/Wrapper.cpp | 4 +- pytlrmvm/src/wrapperdef.h | 3 + pytlrmvm/tlrmat.py | 3 + pytlrmvm/tlrmvmtools.py | 275 +++++-- setup.py | 5 +- src/common/AppUtil.cpp | 3 + src/common/AppUtil.hpp | 3 + src/common/Common.hpp | 3 + src/common/cpu/BlasInterface.cpp | 3 + src/common/cpu/BlasInterface.hpp | 3 + src/common/cpu/Matrix.cpp | 3 + src/common/cpu/Matrix.hpp | 3 + src/common/cpu/Util.cpp | 13 +- src/common/cpu/Util.hpp | 3 + src/common/cpu/vendorblas/blisinterface.cpp | 3 + src/common/cpu/vendorblas/blisinterface.h | 3 + src/common/cpu/vendorblas/mklinterface.cpp | 3 + src/common/cpu/vendorblas/mklinterface.h | 3 + .../cpu/vendorblas/openblasinterface.cpp | 3 + src/common/cpu/vendorblas/openblasinterface.h | 3 + src/common/cuda/Util.cpp | 3 + src/common/cuda/Util.hpp | 3 + src/common/cuda/cublasInterface.cpp | 3 + src/common/cuda/cublasInterface.hpp | 3 + src/common/hip/Util.cpp | 3 + src/common/hip/Util.hpp | 3 + src/common/hip/hipblasInterface.cpp | 3 + src/common/hip/hipblasInterface.hpp | 3 + src/tlrmvm/Tlrmvm.hpp | 3 + src/tlrmvm/cpu/PCMatrix.cpp | 3 + src/tlrmvm/cpu/PCMatrix.hpp | 3 + src/tlrmvm/cpu/TlrmvmCPU.cpp | 27 +- src/tlrmvm/cpu/TlrmvmCPU.hpp | 3 + src/tlrmvm/cpu/TlrmvmDPCPP.cpp | 3 + src/tlrmvm/cpu/TlrmvmDPCPP.hpp | 5 +- src/tlrmvm/cuda/BatchTlrmvmcuda.cpp | 3 + src/tlrmvm/cuda/BatchTlrmvmcuda.hpp | 3 + src/tlrmvm/cuda/BatchTlrmvmcudaFP16.cpp | 3 + src/tlrmvm/cuda/BatchTlrmvmcudaFP16.hpp | 3 + src/tlrmvm/cuda/BatchTlrmvmcudaINT8.cpp | 3 + src/tlrmvm/cuda/BatchTlrmvmcudaINT8.hpp | 3 + src/tlrmvm/cuda/TlrmvmMPfp16.cpp | 3 + src/tlrmvm/cuda/TlrmvmMPfp16.hpp | 3 + src/tlrmvm/cuda/TlrmvmMPint8.cpp | 3 + src/tlrmvm/cuda/TlrmvmMPint8.hpp | 3 + src/tlrmvm/cuda/Tlrmvmcuda.cpp | 3 + src/tlrmvm/cuda/Tlrmvmcuda.hpp | 5 +- src/tlrmvm/cuda/TlrmvmcudaConstRank.cpp | 3 + src/tlrmvm/cuda/TlrmvmcudaConstRank.hpp | 3 + src/tlrmvm/cuda/cudakernel.cu | 3 + src/tlrmvm/cuda/cudakernel.cuh | 3 + src/tlrmvm/cuda/tlrmvmcudautil.cpp | 3 + src/tlrmvm/cuda/tlrmvmcudautil.hpp | 3 + src/tlrmvm/hip/BatchTlrmvmhip.cpp | 715 ++++++++++++++++++ src/tlrmvm/hip/BatchTlrmvmhip.hpp | 87 +++ src/tlrmvm/hip/Tlrmvmhip.cpp | 610 +++++++++++++++ src/tlrmvm/hip/Tlrmvmhip.hpp | 117 +++ src/tlrmvm/hip/TlrmvmhipConstRank.cpp | 487 ++++++++++++ src/tlrmvm/hip/TlrmvmhipConstRank.hpp | 103 +++ src/tlrmvm/hip/hipkernel.cpp | 77 ++ src/tlrmvm/hip/hipkernel.cuh | 25 + src/tlrmvm/hip/tlrmvmhiputil.cpp | 94 +++ src/tlrmvm/hip/tlrmvmhiputil.hpp | 96 +++ test/CMakeLists.txt | 17 + test/cpp/Test_cpu_tlrmvm.cpp | 3 + test/cpp/ex2mpitlrmvm_complexfloat.cpp | 47 +- test/cpp/ex2mpitlrmvm_float.cpp | 59 +- 
test/cpp/ex3_gendata.cpp | 3 + test/cpp/mkl_example.h | 15 +- test/cpp/runexp1.sh | 5 + test/cpp/runexp2A64FX.sh | 5 + test/cpp/runexp2AMD.sh | 5 + test/cpp/runexp2AMD_synthetic.sh | 5 + test/cpp/runexp2ICX.sh | 5 + test/cpp/runexp2NEC.sh | 5 + test/cpp/runexp3_gendata.sh | 5 + test/cuda/Test_A100TimeDecomposition.cpp | 3 + test/cuda/Test_BatchCall_Tlrmvm.cpp | 3 + test/cuda/Test_MultiGPUBatchCall_Tlrmvm.cpp | 3 + test/cuda/astronomy_cudatest.sh | 5 + test/cuda/seismology_cudatest.sh | 5 + test/cuda/seismology_mixedprec_cudatest.sh | 5 + test/hip/Test_hip_constrank.cpp | 61 ++ test/hip/Test_hip_hipblas.cpp | 23 + test/hip/Test_hip_tlrmvm.cpp | 55 ++ test/hip/Test_hip_tlrmvm_singlecall.cpp | 47 ++ test/python/generateinput.py | 5 +- test/python/geninput_astronomy.sh | 5 + test/python/geninput_seismology.sh | 7 +- test/python/mode4generateinput.py | 3 + test/python/seismicexample.py | 3 + test/python/tlrmvmexample.py | 5 +- thirdparty/pybind11 | 1 - 117 files changed, 3249 insertions(+), 287 deletions(-) delete mode 100644 .gitmodules create mode 100644 LICENSE create mode 100644 cmake/hiptlrmvm.cmake delete mode 100644 pytlrmvm/convertmat2npy.py delete mode 100644 pytlrmvm/generateseismicinput.py create mode 100644 src/tlrmvm/hip/BatchTlrmvmhip.cpp create mode 100644 src/tlrmvm/hip/BatchTlrmvmhip.hpp create mode 100644 src/tlrmvm/hip/Tlrmvmhip.cpp create mode 100644 src/tlrmvm/hip/Tlrmvmhip.hpp create mode 100644 src/tlrmvm/hip/TlrmvmhipConstRank.cpp create mode 100644 src/tlrmvm/hip/TlrmvmhipConstRank.hpp create mode 100644 src/tlrmvm/hip/hipkernel.cpp create mode 100644 src/tlrmvm/hip/hipkernel.cuh create mode 100644 src/tlrmvm/hip/tlrmvmhiputil.cpp create mode 100644 src/tlrmvm/hip/tlrmvmhiputil.hpp create mode 100644 test/hip/Test_hip_constrank.cpp create mode 100644 test/hip/Test_hip_hipblas.cpp create mode 100644 test/hip/Test_hip_tlrmvm.cpp create mode 100644 test/hip/Test_hip_tlrmvm_singlecall.cpp delete mode 160000 thirdparty/pybind11 diff --git a/.gitignore b/.gitignore index 4d0b5bc..7c053e1 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ cudacomplex cudasingle amdcpucomplex amdcpusingle +*build/ \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 613ca93..0000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "thirdparty/pybind11"] - path = thirdparty/pybind11 - url = https://github.com/pybind/pybind11.git - diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cab794..9df0cc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.20) set(PROJECT_NAME TLRMVM) @@ -35,7 +35,7 @@ set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) option(BUILD_CPU "Build TLR-MVM using cpp" ON) option(BUILD_DPCPP "Build TLR-MVM on intel archs and use intel dpc++." OFF) option(BUILD_CUDA "Build TLR-MVM on NVIDIA gpu and cuda." OFF) # using NVIDIA GPU -# option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." OFF) # using AMD GPU (AMD is under dev) + option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." 
OFF) # using AMD GPU (AMD is under dev) ######################### # BLAS backend selection @@ -190,18 +190,16 @@ endif() # BUILD_CUDA # HIP library ####################### if(BUILD_HIP) - list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH} $ENV{ROCM_PATH} - $ENV{HIP_PATH}/hip $ENV{HIP_PATH}/llvm/lib/clang/14.0.0/lib/linux) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result ") + enable_language(HIP) + find_package(HIP REQUIRED) include(cmake/hiptlrmvm.cmake) add_library(tlrmvmhiplib SHARED ${CPU_HEADERS} ${CPU_SRCS} ${HIP_HEADERS} ${HIP_SRCS}) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device roc::hipblas) +# find_package(hipblas REQUIRED) + set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device hip::host hipblas) target_include_directories(tlrmvmhiplib PUBLIC ${TLRMVM_INCS}) target_link_libraries(tlrmvmhiplib PUBLIC ${TLRMVM_LIBS}) AddCompileDefinitions(tlrmvmhiplib) - target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ... +# target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ... target_compile_definitions(tlrmvmhiplib PUBLIC USE_HIP) install(TARGETS tlrmvmhiplib DESTINATION lib) endif() # BUILD_HIP @@ -212,7 +210,7 @@ endif() # BUILD_HIP ################# if(BUILD_PYTHON AND (BUILD_CUDA OR BUILD_HIP)) # now python is only available for CUDA and HIP build. - add_subdirectory(thirdparty/pybind11) + find_package(pybind11 REQUIRED) add_subdirectory(pytlrmvm) endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..32483c5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2022, King Abdullah University of Science and Technology +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 1de8f15..7f598fb 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,10 @@ Compile and install ## 4. Test You also need to download the dataset to run the experiments. dataset download url: -https://drive.google.com/drive/folders/1_DSyloFjlScXGTlA1_ybJnTne59tUpgR?usp=sharing +1. 
seismic redatuming dataset +https://zenodo.org/record/6582600 +2. MAVIS AO system matrices dataset +https://zenodo.org/record/7305622 after download, put it in a separate folder and set `WORK_ROOT` to that folder. diff --git a/benchmark/cpp/bench_mvm_fp32.cpp b/benchmark/cpp/bench_mvm_fp32.cpp index 313243f..cc41c40 100644 --- a/benchmark/cpp/bench_mvm_fp32.cpp +++ b/benchmark/cpp/bench_mvm_fp32.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/benchmark/cpp/bench_mvm_singlecomplex.cpp b/benchmark/cpp/bench_mvm_singlecomplex.cpp index 224d628..23045af 100644 --- a/benchmark/cpp/bench_mvm_singlecomplex.cpp +++ b/benchmark/cpp/bench_mvm_singlecomplex.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/benchmark/cuda/bench_mvm_cuda_fp32.cpp b/benchmark/cuda/bench_mvm_cuda_fp32.cpp index 117f8d7..94ad6c7 100644 --- a/benchmark/cuda/bench_mvm_cuda_fp32.cpp +++ b/benchmark/cuda/bench_mvm_cuda_fp32.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/benchmark/cuda/bench_mvm_cuda_singlecomplex.cpp b/benchmark/cuda/bench_mvm_cuda_singlecomplex.cpp index d513ad6..321c174 100644 --- a/benchmark/cuda/bench_mvm_cuda_singlecomplex.cpp +++ b/benchmark/cuda/bench_mvm_cuda_singlecomplex.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/benchmark/hip/Bench_hip_constrank.cpp b/benchmark/hip/Bench_hip_constrank.cpp index 5b65301..38c78bf 100644 --- a/benchmark/hip/Bench_hip_constrank.cpp +++ b/benchmark/hip/Bench_hip_constrank.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/benchmark/hip/Bench_hip_tlrmvm.sh b/benchmark/hip/Bench_hip_tlrmvm.sh index 444fe78..0ea1fda 100644 --- a/benchmark/hip/Bench_hip_tlrmvm.sh +++ b/benchmark/hip/Bench_hip_tlrmvm.sh @@ -1,5 +1,10 @@ #!/bin/bash +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved.
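+#
+# The flag meanings below are assumptions inferred from the flag names and the
+# ArgsParser used by the benchmark drivers, not documented in this script:
+#   --M/--N      matrix dimensions, --nb the tile size,
+#   --threshold  the compression accuracy the U/V bases were generated with,
+#   --streams    number of HIP streams, --loopsize number of timed iterations.
+# WORK_ROOT must point at the dataset folder described in the README, e.g.
+#   export WORK_ROOT=$HOME/tlrmvm-data   # hypothetical path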
+# + ./install/test/Bench_hip_constrank --M=9801 --N=9801 --threshold=0.001 \ --datafolder=$WORK_ROOT/compresseddata --nb=256 \ --problem=Mode4_Ordernormal_Mck_freqslice_100 --streams=10 --loopsize=200 \ No newline at end of file diff --git a/cmake/hiptlrmvm.cmake b/cmake/hiptlrmvm.cmake new file mode 100644 index 0000000..33b4d25 --- /dev/null +++ b/cmake/hiptlrmvm.cmake @@ -0,0 +1,21 @@ +include(${PROJECT_SOURCE_DIR}/cmake/tlrmvm.cmake) +set(HIP_SRCS) +set(HIP_HEADERS) + +# Common +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cpp) +list(APPEND HIP_SRCS ${TMP}) +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cu) +list(APPEND HIP_SRCS ${TMP}) +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.hpp) +list(APPEND HIP_HEADERS ${TMP}) + +# tlrmvm +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cpp) +list(APPEND HIP_SRCS ${TMP}) +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cu) +list(APPEND HIP_SRCS ${TMP}) +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.hpp) +list(APPEND HIP_HEADERS ${TMP}) +file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cuh) +list(APPEND HIP_HEADERS ${TMP}) \ No newline at end of file diff --git a/install/install-x86-cuda.sh b/install/install-x86-cuda.sh index 8b63ec4..be5726d 100755 --- a/install/install-x86-cuda.sh +++ b/install/install-x86-cuda.sh @@ -1,4 +1,9 @@ #!/usr/bin/bash +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + cmake -DCMAKE_CUDA_COMPILER:PATH=$(which nvcc) -DCMAKE_INSTALL_PREFIX=./install -DCMAKE_C_COMPILER=$(which gcc) -DCMAKE_CXX_COMPILER=$(which g++) \ -DUSE_MPI=ON -DUSE_MKL=ON -DBUILD_CUDA=ON -DBUILD_TEST=ON -DBUILD_PYTHON=ON -DPYTHON_EXECUTABLE=$(which python) #-DPython_ROOT_DIR=/home/hongy0a/miniconda3/envs/a100env diff --git a/install/install-x86.sh b/install/install-x86.sh index e69de29..6f3a30d 100644 --- a/install/install-x86.sh +++ b/install/install-x86.sh @@ -0,0 +1,5 @@ +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + diff --git a/install/spack-install-x86-cuda.sh b/install/spack-install-x86-cuda.sh index 3f0e6db..e90566e 100644 --- a/install/spack-install-x86-cuda.sh +++ b/install/spack-install-x86-cuda.sh @@ -1,4 +1,9 @@ #!/usr/bin/bash +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + git clone -c feature.manyFiles=true https://github.com/spack/spack.git . $HOME/spack/share/spack/setup-env.sh spack compiler find diff --git a/install/spack-install-x86.sh b/install/spack-install-x86.sh index b5d458c..46b4767 100644 --- a/install/spack-install-x86.sh +++ b/install/spack-install-x86.sh @@ -1,4 +1,9 @@ #!/usr/bin/bash +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + git clone -c feature.manyFiles=true https://github.com/spack/spack.git . $HOME/spack/share/spack/setup-env.sh spack compiler find diff --git a/pytlrmvm/Densemat.py b/pytlrmvm/Densemat.py index f8020ff..0be59f8 100644 --- a/pytlrmvm/Densemat.py +++ b/pytlrmvm/Densemat.py @@ -1,3 +1,6 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. 
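+#
+# A note on this module (inferred from the imports below rather than stated
+# anywhere in the file): it provides helpers for loading dense matrices
+# shipped as MATLAB .mat files via scipy.io.loadmat, so that they can be
+# handed to the compression utilities in tlrmvmtools.py.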
+ from os.path import join from scipy.io import loadmat import numpy as np diff --git a/pytlrmvm/__init__.py b/pytlrmvm/__init__.py index 3384ec9..ef08afc 100644 --- a/pytlrmvm/__init__.py +++ b/pytlrmvm/__init__.py @@ -13,6 +13,9 @@ def __bootstrap__(): spec.loader.exec_module(mod) __bootstrap__() +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + from ._wrapper import * # from .tlrmvmtools import * # from .tlrmat import * diff --git a/pytlrmvm/_wrapper.py b/pytlrmvm/_wrapper.py index 344a05e..67dd273 100644 --- a/pytlrmvm/_wrapper.py +++ b/pytlrmvm/_wrapper.py @@ -1,3 +1,6 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + import TLRMVMpy as _cppimpl import numpy as _np from time import time as _time diff --git a/pytlrmvm/convertmat2npy.py b/pytlrmvm/convertmat2npy.py deleted file mode 100644 index 8d4a403..0000000 --- a/pytlrmvm/convertmat2npy.py +++ /dev/null @@ -1,27 +0,0 @@ -################################################################## -# @copyright (c) 2021- King Abdullah University of Science and -# Technology (KAUST). All rights reserved. -# -# Author: Yuxi Hong, 2021.10.27 -# Description: Convert matlab .mat file to numpy npy. -################################################################## -import os -from os.path import join -from scipy.io import loadmat -import numpy as np -import pickle -import argparse -import time - -parser = argparse.ArgumentParser() -parser.add_argument('--filename', type=str, help='matfilename') -parser.add_argument('--mat_root', type=str, help='matfile dir') -parser.add_argument('--work_root', type=str, help='workspace dir') -args = parser.parse_args() - -matname = join(args.mat_root, args.filename + '.mat') -npyname = join(args.work_root, args.filename + '.npy') -work_root = args.work_root -A = loadmat(matname)['Rfreq'] -with open(npyname, 'wb') as f: - np.save(f, A) diff --git a/pytlrmvm/generateseismicinput.py b/pytlrmvm/generateseismicinput.py deleted file mode 100644 index f388c7c..0000000 --- a/pytlrmvm/generateseismicinput.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -import sys -import os -import argparse -import time - -parser = argparse.ArgumentParser() -parser.add_argument('--TLRMVM_ROOT', type=str, help='installation dir') -parser.add_argument('--WORK_ROOT', type=str, help='workspace dir') -parser.add_argument('--nb', type=int, help='nb') -parser.add_argument('--error_threshold', type=str,help='error threshold') -parser.add_argument('--problemname', type=str, help='problem name') -parser.add_argument('--datatype', type=str, help='datatype of dataset') - -args = parser.parse_args() - -print("Your installation path: ", args.TLRMVM_ROOT) -print("Your workspace path: ", args.WORK_ROOT) - -# your TLRMVM_ROOT should point to your installation path. 
-TLRMVM_ROOT = args.TLRMVM_ROOT -sys.path.append( os.path.join( args.TLRMVM_ROOT,"python") ) -WORK_ROOT = args.WORK_ROOT -print("Downloading dataset to path: {}".format( WORK_ROOT )) -if not os.path.exists(WORK_ROOT): - os.mkdir(WORK_ROOT) -problemname = args.problemname - -from tlrmvmtools import * -dtype = None -datatype = args.datatype -if datatype == 'float': - dtype = np.float32 -elif datatype == 'double': - dtype = np.float64 -elif datatype == 'csingle': - dtype = np.csingle -elif datatype == 'cdouble': - dtype = np.cdouble -else: - print("Not support datatype.") - exit(1) - -# get your data matrix in numpy format -# you can download the matrix used in this example at -# https://drive.google.com/file/d/1KY4eeVSMm2mWOOKVU7QjsAf6tOREv-99/view?usp=sharing -A = np.load( os.path.join(WORK_ROOT, "{}.npy".format(problemname)) ).astype(dtype) - -if datatype in ['csingle', 'cdouble']: - randomx = np.random.rand(A.shape[1]) + 1j * np.random.rand(A.shape[1]) - randomx = randomx.astype(dtype) -else: - randomx = np.random.rand(A.shape[1]) - randomx = randomx.astype(dtype) - -""" -Below is needed for creating input of tlrmvm. -They are parameters Tile size (nb) and Accuracy Threshold (error_threshold) -you can play with to get decent performance and numerical accuracy. -""" -m = A.shape[0] -n = A.shape[1] -nb = args.nb -error_threshold = args.error_threshold # we use string for easy concatnating. -workplacefolder = WORK_ROOT -datasetname = args.problemname - -# create tlrmvm util class -tlrmvmutil = TLRMVM_Util(A, nb, workplacefolder, error_threshold, datasetname) -# compute svd and save -tlrmvmutil.computesvd() -# create input of tlrmvm -tlrmvmutil.saveUV() -# get compression info -tlrmvmutil.printdatainfo() - -tlrmvmutil.saveX(randomx) \ No newline at end of file diff --git a/pytlrmvm/src/BatchTlrmvm.cpp b/pytlrmvm/src/BatchTlrmvm.cpp index 1c42ac9..1ae2828 100644 --- a/pytlrmvm/src/BatchTlrmvm.cpp +++ b/pytlrmvm/src/BatchTlrmvm.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 12/03/2022. // diff --git a/pytlrmvm/src/CommonWrapper.cpp b/pytlrmvm/src/CommonWrapper.cpp index 930dd44..6753e69 100644 --- a/pytlrmvm/src/CommonWrapper.cpp +++ b/pytlrmvm/src/CommonWrapper.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 11/03/2022. // diff --git a/pytlrmvm/src/Tlrmvm.cpp b/pytlrmvm/src/Tlrmvm.cpp index 400fc6e..5bcc75b 100644 --- a/pytlrmvm/src/Tlrmvm.cpp +++ b/pytlrmvm/src/Tlrmvm.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 12/03/2022. // diff --git a/pytlrmvm/src/Wrapper.cpp b/pytlrmvm/src/Wrapper.cpp index ffc5170..44bf74e 100644 --- a/pytlrmvm/src/Wrapper.cpp +++ b/pytlrmvm/src/Wrapper.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
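+//
+// This translation unit defines the TLRMVMpy extension module; see the
+// PYBIND11_MODULE(TLRMVMpy, m) block below. A sketch of the Python side,
+// using only names registered in this file (argument lists omitted):
+//
+//   import TLRMVMpy
+//   TLRMVMpy.SetMaskmat(...)               # bound below via m.def
+//   TLRMVMpy.BatchUpdatexgpu_INT8_cf(...)  # batched int8 x-update
+//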
+ #include #include #include @@ -157,7 +160,6 @@ PYBIND11_MODULE(TLRMVMpy, m) { m.def("BatchUpdatexgpu_INT8_cf", &BatchUpdatex_INT8); m.def("BatchUpdateygpu_INT8_cf", &BatchUpdatey_INT8); - addbatchtlrmvmgpufp16int8(m); m.def("SetMaskmat", &SetMaskmat); } diff --git a/pytlrmvm/src/wrapperdef.h b/pytlrmvm/src/wrapperdef.h index 34398cf..9822dd9 100644 --- a/pytlrmvm/src/wrapperdef.h +++ b/pytlrmvm/src/wrapperdef.h @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 12/03/2022. // diff --git a/pytlrmvm/tlrmat.py b/pytlrmvm/tlrmat.py index 3291c93..24af5ad 100644 --- a/pytlrmvm/tlrmat.py +++ b/pytlrmvm/tlrmat.py @@ -1,3 +1,6 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + import time import pickle import os diff --git a/pytlrmvm/tlrmvmtools.py b/pytlrmvm/tlrmvmtools.py index c856145..71c9c8a 100644 --- a/pytlrmvm/tlrmvmtools.py +++ b/pytlrmvm/tlrmvmtools.py @@ -1,114 +1,120 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + ################################################################## -# @copyright (c) 2021- King Abdullah University of Science and -# Technology (KAUST). All rights reserved. # -# Author: Yuxi Hong, 2021.10.27 -# Description: A tools for generating compressed U and V bases. -# They are input of TLR-MVM. +# Author: Yuxi Hong +# Description: A tool for generating the compressed U and V bases +# which are the input of TLR-MVM. ################################################################## import os from os.path import join, exists from tqdm import tqdm import numpy as np -import pickle +import pickle +from scipy.linalg import svd + class TLRMVM_Util: - """A TLR-MVM Utility class 1. compute svd for input of TLR-MVM 3. save U and V bases 4. save Dense matrix """ + """A TLR-MVM Utility class 1. compute svd for input of TLR-MVM 2. save U and V bases 3. save Dense matrix """ + def __init__(self, denseAarray, nb, datafolder, error_threshold, problemname, rankmodule) -> None: self.denseA = denseAarray self.dtype = denseAarray.dtype self.m = denseAarray.shape[0] self.n = denseAarray.shape[1] - self.nb = nb + + self.nb = nb self.mtg = self.m // nb if self.m % nb == 0 else self.m // nb + 1 self.ntg = self.n // nb if self.n % nb == 0 else self.n // nb + 1 - self.paddingm = self.mtg * nb - self.paddingn = self.ntg * nb + self.paddingm = self.mtg * nb + self.paddingn = self.ntg * nb self.datafolder = datafolder if not exists(self.datafolder): print("Folder {} does not exist!".format(self.datafolder)) self.error_threshold = error_threshold self.problemname = problemname self.rankfile = join(self.datafolder, 'compresseddata', - '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,self.nb,self.error_threshold)) + '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold)) self.rankmodule = rankmodule def computesvd(self): A = self.denseA - padding_m = self.paddingm + padding_m = self.paddingm padding_n = self.paddingn - m = self.m - n = self.n - ntiles = self.ntg - mtiles = self.mtg + m = self.m + n = self.n + mtiles = self.mtg + ntiles = self.ntg svdsavepath = join(self.datafolder, 'SVDinfo') if not exists(svdsavepath): os.mkdir(svdsavepath) - nb = self.nb - svdname = join( svdsavepath, '{}_nb{}.pickle'.format(self.problemname,nb) ) + nb = self.nb + svdname = join(svdsavepath, '{}_nb{}.pickle'.format(self.problemname, nb)) if exists(svdname): print("svd {} exists.".format(svdname)) - return + return else: print("save svd to {}. 
".format(svdname)) bigmap = dict() - padA = np.zeros((padding_m,padding_n),dtype=self.dtype) - padA[:m,:n] = A + padA = np.zeros((padding_m, padding_n), dtype=self.dtype) + padA[:m, :n] = A for j in tqdm(range(ntiles)): for i in range(mtiles): - curblock = padA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] - [u,s,v] = np.linalg.svd(curblock) - bigmap['{}_{}'.format(i,j)] = [u,s,v] - with open( svdname,'wb') as f: - pickle.dump(bigmap, f) + curblock = padA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + [u, s, v] = svd(curblock) + bigmap['{}_{}'.format(i, j)] = [u, s, v] + with open(svdname, 'wb') as f: + pickle.dump(bigmap, f) def saveX(self, xvec): xfile = join(self.datafolder, '{}_x.bin'.format(self.problemname)) xvec.tofile(xfile) def saveUV(self): - svdname = join( self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname,self.nb) ) + svdname = join(self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname, self.nb)) if not exists(svdname): print("please do svd to matrix first!") with open(svdname, 'rb') as f: bigmap = pickle.load(f) - nb = self.nb + nb = self.nb acc = self.error_threshold - uvsavepath = join(self.datafolder,'compresseddata') + uvsavepath = join(self.datafolder, 'compresseddata') if not exists(uvsavepath): os.mkdir(uvsavepath) - ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - if exists(ufile) and exists(vfile) and exists(rfile): - print("Compress file exists. ") - return + ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + print("generate uvr file to {}.".format(uvsavepath)) - padding_m = self.paddingm + padding_m = self.paddingm padding_n = self.paddingn - m = self.m - n = self.n - ntiles = self.ntg - mtiles = self.mtg - uvsavepath = self.datafolder - nb = self.nb + m = self.m + n = self.n + ntiles = self.ntg + mtiles = self.mtg + nb = self.nb tmpacc = self.error_threshold - acc = tmpacc if isinstance(tmpacc,float) else float(tmpacc) - ApproximateA = np.zeros((padding_m, padding_n),dtype=self.dtype) - originpadA = np.zeros((padding_m, padding_n),dtype=self.dtype) - originpadA[:m,:n] = self.denseA - normA = np.linalg.norm(self.denseA,'fro') - ranklist = np.zeros(mtiles * ntiles,dtype=np.int32) + acc = tmpacc if isinstance(tmpacc, float) else float(tmpacc) + ApproximateA = np.zeros((padding_m, padding_n), dtype=self.dtype) + originpadA = np.zeros((padding_m, padding_n), dtype=self.dtype) + originpadA[:m, :n] = self.denseA + normA = np.linalg.norm(self.denseA, 'fro') + ranklist = np.zeros((mtiles, ntiles), dtype=np.int32) + print("rankmat shape, ", ranklist.shape) ulist = [[] for _ in range(mtiles)] vlist = [[] for _ in range(mtiles)] - for i in tqdm(range(mtiles)): - for j in range(ntiles): - curblock = originpadA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] - [u,s,v] = bigmap['{}_{}'.format(i,j)] + + p = mtiles + for i in tqdm(range(mtiles - 1)): + for j in range(ntiles - 1): + curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + normblock = np.linalg.norm(curblock, 'fro') + [u, s, v] = bigmap['{}_{}'.format(i, j)] srk = 0 erk = nb while srk != erk: @@ -116,54 +122,169 @@ def saveUV(self): tmpu = u[:, :midrk] tmps = s[:midrk] tmpv = v[:midrk, :] - if 
np.linalg.norm(curblock-(tmpu*tmps)@tmpv, ord='fro') < normA * acc: + if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc: erk = midrk else: - srk = midrk+1 - if srk % self.rankmodule != 0: - mod4srk = ((srk//self.rankmodule) + 1) * self.rankmodule - else: - mod4srk = srk - mod4srk = min(mod4srk, nb) - srk = mod4srk + srk = midrk + 1 + if srk == 0: + srk = 1 tmpu = u[:, :srk] tmps = s[:srk] tmpv = v[:srk, :] + ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv + us = tmpu * tmps + vt = tmpv if srk == 0: - ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = np.zeros((nb,nb),dtype=self.dtype) + ranklist[i, j] = 1 + ulist[i].append(np.zeros((nb, 1), dtype=self.dtype)) + vlist[i].append(np.zeros((1, nb), dtype=self.dtype)) else: - ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = (tmpu*tmps)@tmpv - us = tmpu * tmps + ranklist[i, j] = srk + ulist[i].append(us) + vlist[i].append(vt) + + def getsrk(normA, nb, acc, u, s, v): + srk = 0 + erk = nb + while srk != erk: + midrk = (srk + erk) // 2 + tmpu = u[:, :midrk] + tmps = s[:midrk] + tmpv = v[:midrk, :] + if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc: + erk = midrk + else: + srk = midrk + 1 + return srk + + for i in tqdm(range(mtiles)): + for j in range(ntiles): + if i < mtiles - 1 and j < ntiles - 1: + continue + curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + normblock = np.linalg.norm(curblock, 'fro') + [u, s, v] = bigmap['{}_{}'.format(i, j)] + if i < mtiles - 1 or j < ntiles - 1: + if i == mtiles - 1: + presum = np.sum(ranklist[:, j]) + srk = getsrk(normA, nb, acc, u, s, v) + while srk < nb and (srk + presum) % self.rankmodule != 0: + srk += 1 + + if srk == nb and (srk + presum) % self.rankmodule != 0: + print("can't find a solution! i = mtiles") + exit() + else: + ranklist[i, j] = srk + elif j == ntiles - 1: + presum = np.sum(ranklist[i, :]) + srk = getsrk(normA, nb, acc, u, s, v) + while srk < nb and (srk + presum) % self.rankmodule != 0: + srk += 1 + if srk == nb and (srk + presum) % self.rankmodule != 0: + print("can't find a solution! 
j = ntiles") + exit() + else: + ranklist[i, j] = srk + elif i == mtiles - 1 and j == ntiles - 1: + srk = 0 + while srk < nb and (srk + np.sum(ranklist[i, :])) % self.rankmodule != 0 and \ + (srk + np.sum(ranklist[:, j])) % self.rankmodule != 0: + srk += 1 + if srk == nb: + print("can't find a solution!") + exit() + else: + ranklist[i, j] = srk + if srk == 0: + srk = self.rankmodule + tmpu = u[:, :srk] + tmps = s[:srk] + tmpv = v[:srk, :] + ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv + us = tmpu * tmps vt = tmpv if srk == 0: - ranklist[j*mtiles+i] = 1 - ulist[i].append(np.zeros((nb,1),dtype=self.dtype)) - vlist[i].append(np.zeros((1,nb),dtype=self.dtype)) + ranklist[i, j] = 1 + ulist[i].append(np.zeros((nb, 1), dtype=self.dtype)) + vlist[i].append(np.zeros((1, nb), dtype=self.dtype)) else: - ranklist[j*mtiles+i] = srk + ranklist[i, j] = srk ulist[i].append(us) vlist[i].append(vt) tmpurow = [] for x in ulist: - tmpurow.append(np.concatenate(x,axis=1)) - finalu = np.concatenate(tmpurow,axis=1) + tmpurow.append(np.concatenate(x, axis=1)) + finalu = np.concatenate(tmpurow, axis=1) finalu.T.tofile(ufile) tmpvcol = [] - npvlist = np.array(vlist,dtype=np.object) + npvlist = np.array(vlist, dtype=np.object) for i in range(npvlist.shape[1]): - tmpvcol.append(np.concatenate(npvlist[:,i],axis=0)) - + tmpvcol.append(np.concatenate(npvlist[:, i], axis=0)) + with open(vfile, 'wb') as f: for x in tmpvcol: x.T.tofile(f) - ranklist.tofile(rfile) + ranklist.T.tofile(rfile) def printdatainfo(self): print("Description of Dataset: ") - print("problem name : {} ".format(self.problemname) ) + print("problem name : {} ".format(self.problemname)) print("m is {} n {} nb is {} error threshold is {}.".format(self.m, self.n, self.nb, self.error_threshold)) - rankfile = join(self.datafolder, 'compresseddata', '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold)) + rankfile = join(self.datafolder, 'compresseddata', + '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold)) self.ranklist = np.fromfile(rankfile, dtype=np.int32) - mn = self.m * self.n + mn = self.m * self.n rank = np.sum(self.ranklist) - print("Global rank is {}, compression rate is {:.3f}%.".format( rank, 2*rank*self.nb / mn * 100)) + print("Global rank is {}, compression rate is {:.3f}%.".format(rank, 2 * rank * self.nb / mn * 100)) + + +if __name__ == "__main__": + import numpy as np + import os + import argparse + from astropy.io.fits import open as fitsopen + from scipy.io import loadmat + parser = argparse.ArgumentParser() + parser.add_argument('--nb', type=int, help='nb') + parser.add_argument('--error_threshold', type=str,help='error threshold.') + parser.add_argument('--compressed_name', type=str, help='The file name for compressed U,V,and R.') + parser.add_argument('--data_dir', type=str, help='your original data dir.') + parser.add_argument('--data_type', type=str, help='datatype of dataset.') + parser.add_argument('--data_name', type=str, help='The name of original matrix.') + parser.add_argument('--matlabmat_name', type=str, default=None, help='The name of original matrix in .mat file.') + parser.add_argument('--rank_module', type=int, help='rank module.') + args = parser.parse_args() + dtype = None + datatype = args.data_type + if datatype == 'float': + dtype = np.float32 + elif datatype == 'double': + dtype = np.float64 + elif datatype == 'csingle': + dtype = np.csingle + elif datatype == 'cdouble': + dtype = np.cdouble + else: + print("Not support 
datatype.") + exit(1) + A = None + if args.data_name.split('.')[-1] == 'npy': + A = np.load(join(args.data_dir,args.data_name)).astype(dtype) + elif args.data_name.split('.')[-1] == 'fits': + A = fitsopen(join(args.data_dir,args.data_name))[0].data.astype(dtype) + elif args.data_name.split('.')[-1] == 'mat': + A = loadmat(join(args.data_dir,args.data_name))[args.matlabmat_name] + else: + A = pickle.load(open(join(args.data_dir,args.data_name))).astype(dtype) + rankmodule = int(args.rank_module) + if rankmodule == 0: + print("not 0.") + exit() + tlrmvmutil = TLRMVM_Util(A, args.nb, args.data_dir, args.error_threshold, args.compressed_name, rankmodule) + # compute svd and save + tlrmvmutil.computesvd() + # create input of tlrmvm + tlrmvmutil.saveUV() + # get compression info + tlrmvmutil.printdatainfo() diff --git a/setup.py b/setup.py index 69e8743..fbb9add 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,6 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + import os import pathlib import subprocess @@ -64,7 +67,7 @@ def cuda_cmake(self, ext): "-DCMAKE_C_COMPILER={}".format(self.c_compiler), "-DCMAKE_CXX_COMPILER={}".format(self.cxx_compiler), "-DCMAKE_CUDA_HOST_COMPILER={}".format(self.cxx_compiler), - "-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler), + #"-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler), "-DUSE_MKL=ON", "-DUSE_MPI=ON", "-DBUILD_CUDA=ON", diff --git a/src/common/AppUtil.cpp b/src/common/AppUtil.cpp index 894f7a2..c5188b7 100644 --- a/src/common/AppUtil.cpp +++ b/src/common/AppUtil.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include "AppUtil.hpp" diff --git a/src/common/AppUtil.hpp b/src/common/AppUtil.hpp index c418d6b..fd2668d 100644 --- a/src/common/AppUtil.hpp +++ b/src/common/AppUtil.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef APPUTIL_H #define APPUTIL_H diff --git a/src/common/Common.hpp b/src/common/Common.hpp index 3cc3f93..c3de49b 100644 --- a/src/common/Common.hpp +++ b/src/common/Common.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include "cpu/BlasInterface.hpp" diff --git a/src/common/cpu/BlasInterface.cpp b/src/common/cpu/BlasInterface.cpp index 349a98f..59a1026 100644 --- a/src/common/cpu/BlasInterface.cpp +++ b/src/common/cpu/BlasInterface.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include "BlasInterface.hpp" void gemm(const int *A, const int *B, int *C, int m, int n, int k){ diff --git a/src/common/cpu/BlasInterface.hpp b/src/common/cpu/BlasInterface.hpp index ee42d1b..7c4f409 100644 --- a/src/common/cpu/BlasInterface.hpp +++ b/src/common/cpu/BlasInterface.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef BLASINTERFACE_H #define BLASINTERFACE_H diff --git a/src/common/cpu/Matrix.cpp b/src/common/cpu/Matrix.cpp index 7fa7872..616c862 100644 --- a/src/common/cpu/Matrix.cpp +++ b/src/common/cpu/Matrix.cpp @@ -1,4 +1,7 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
+ #include #include #include diff --git a/src/common/cpu/Matrix.hpp b/src/common/cpu/Matrix.hpp index c5ad350..1becd1f 100644 --- a/src/common/cpu/Matrix.hpp +++ b/src/common/cpu/Matrix.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef MATRIX_H #define MATRIX_H diff --git a/src/common/cpu/Util.cpp b/src/common/cpu/Util.cpp index 78bac44..f87871c 100644 --- a/src/common/cpu/Util.cpp +++ b/src/common/cpu/Util.cpp @@ -1,4 +1,7 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include @@ -137,22 +140,16 @@ ArgsParser::ArgsParser(int argc, char**argv){ int ArgsParser::getint(string key){ if(argmap.find(key) == argmap.end()) - {cout << "key error in getint" << endl; exit(0);} + {cout << "key error in getint:" << key << endl; exit(0);} return atoi(argmap[key].c_str()); } string ArgsParser::getstring(string key){ if(argmap.find(key) == argmap.end()) - {cout << "key error in getstring" << endl; exit(0);} + {cout << "key error in getstring: "<< key << endl; exit(0);} return argmap[key]; } -bool ArgsParser::getbool(string key){ - if(argmap.find(key) == argmap.end()) - {cout << "key error in getint" << endl; exit(0);} - return atoi(argmap[key].c_str()); -} - template diff --git a/src/common/cpu/Util.hpp b/src/common/cpu/Util.hpp index df16510..e89556e 100644 --- a/src/common/cpu/Util.hpp +++ b/src/common/cpu/Util.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include diff --git a/src/common/cpu/vendorblas/blisinterface.cpp b/src/common/cpu/vendorblas/blisinterface.cpp index e256f7e..edec5bc 100644 --- a/src/common/cpu/vendorblas/blisinterface.cpp +++ b/src/common/cpu/vendorblas/blisinterface.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifdef USE_BLIS #include "blisinterface.h" diff --git a/src/common/cpu/vendorblas/blisinterface.h b/src/common/cpu/vendorblas/blisinterface.h index 8981f99..a125f48 100644 --- a/src/common/cpu/vendorblas/blisinterface.h +++ b/src/common/cpu/vendorblas/blisinterface.h @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef BLIS_INTERFACE_H #define BLIS_INTERFACE_H diff --git a/src/common/cpu/vendorblas/mklinterface.cpp b/src/common/cpu/vendorblas/mklinterface.cpp index 13f6d7f..8ee5e34 100644 --- a/src/common/cpu/vendorblas/mklinterface.cpp +++ b/src/common/cpu/vendorblas/mklinterface.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifdef USE_MKL #include "mklinterface.h" diff --git a/src/common/cpu/vendorblas/mklinterface.h b/src/common/cpu/vendorblas/mklinterface.h index 245d2ea..ba2cfdf 100644 --- a/src/common/cpu/vendorblas/mklinterface.h +++ b/src/common/cpu/vendorblas/mklinterface.h @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
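+//
+// A note on the vendorblas layout (summarizing the guards visible in the
+// sibling sources, not new behavior): each backend is compiled behind its
+// own macro -- #ifdef USE_MKL here, #ifdef USE_BLIS for BLIS, and
+// #if defined(USE_OPENBLAS) || defined(USE_COMPILER_BLAS) for OpenBLAS --
+// so the generic gemm entry points in BlasInterface.hpp resolve to exactly
+// one vendor backend selected at CMake time (e.g. -DUSE_MKL=ON).
+//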
+ #ifndef MKL_INTERFACE_H #define MKL_INTERFACE_H diff --git a/src/common/cpu/vendorblas/openblasinterface.cpp b/src/common/cpu/vendorblas/openblasinterface.cpp index 4945115..5965087 100644 --- a/src/common/cpu/vendorblas/openblasinterface.cpp +++ b/src/common/cpu/vendorblas/openblasinterface.cpp @@ -1,4 +1,7 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #if defined(USE_OPENBLAS) || defined(USE_COMPILER_BLAS) #include "openblasinterface.h" diff --git a/src/common/cpu/vendorblas/openblasinterface.h b/src/common/cpu/vendorblas/openblasinterface.h index 2d69563..765e9d5 100644 --- a/src/common/cpu/vendorblas/openblasinterface.h +++ b/src/common/cpu/vendorblas/openblasinterface.h @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef OPENBLAS_INTERFACE_H #define OPENBLAS_INTERFACE_H diff --git a/src/common/cuda/Util.cpp b/src/common/cuda/Util.cpp index dc12acc..971d789 100644 --- a/src/common/cuda/Util.cpp +++ b/src/common/cuda/Util.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include "Util.hpp" #include "../cpu/Util.hpp" diff --git a/src/common/cuda/Util.hpp b/src/common/cuda/Util.hpp index a5e19c4..43b0ebe 100644 --- a/src/common/cuda/Util.hpp +++ b/src/common/cuda/Util.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef CUDA_UTIL_H #define CUDA_UTIL_H diff --git a/src/common/cuda/cublasInterface.cpp b/src/common/cuda/cublasInterface.cpp index 5cb388b..cfc7dba 100644 --- a/src/common/cuda/cublasInterface.cpp +++ b/src/common/cuda/cublasInterface.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include // if you need CUBLAS v2, include before magma.h #include diff --git a/src/common/cuda/cublasInterface.hpp b/src/common/cuda/cublasInterface.hpp index 3e0ca75..d81d09e 100644 --- a/src/common/cuda/cublasInterface.hpp +++ b/src/common/cuda/cublasInterface.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef CUBLASINTERFACE_HPP #define CUBLASINTERFACE_HPP diff --git a/src/common/hip/Util.cpp b/src/common/hip/Util.cpp index 608ebea..8e0d3f2 100644 --- a/src/common/hip/Util.cpp +++ b/src/common/hip/Util.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 28/02/2022. // diff --git a/src/common/hip/Util.hpp b/src/common/hip/Util.hpp index 0852321..27d54bc 100644 --- a/src/common/hip/Util.hpp +++ b/src/common/hip/Util.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 28/02/2022. // diff --git a/src/common/hip/hipblasInterface.cpp b/src/common/hip/hipblasInterface.cpp index c7cc5c6..cf40144 100644 --- a/src/common/hip/hipblasInterface.cpp +++ b/src/common/hip/hipblasInterface.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 28/02/2022. 
// diff --git a/src/common/hip/hipblasInterface.hpp b/src/common/hip/hipblasInterface.hpp index 4684a90..7b2d8ad 100644 --- a/src/common/hip/hipblasInterface.hpp +++ b/src/common/hip/hipblasInterface.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 28/02/2022. // diff --git a/src/tlrmvm/Tlrmvm.hpp b/src/tlrmvm/Tlrmvm.hpp index 20b8905..e5ce310 100644 --- a/src/tlrmvm/Tlrmvm.hpp +++ b/src/tlrmvm/Tlrmvm.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef TLRMVM_H #define TLRMVM_H diff --git a/src/tlrmvm/cpu/PCMatrix.cpp b/src/tlrmvm/cpu/PCMatrix.cpp index e76c50a..08ebd69 100644 --- a/src/tlrmvm/cpu/PCMatrix.cpp +++ b/src/tlrmvm/cpu/PCMatrix.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/src/tlrmvm/cpu/PCMatrix.hpp b/src/tlrmvm/cpu/PCMatrix.hpp index ae7d601..a7e7ee3 100644 --- a/src/tlrmvm/cpu/PCMatrix.hpp +++ b/src/tlrmvm/cpu/PCMatrix.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef PCMATRIX_H #define PCMATRIX_H diff --git a/src/tlrmvm/cpu/TlrmvmCPU.cpp b/src/tlrmvm/cpu/TlrmvmCPU.cpp index a880922..5771919 100644 --- a/src/tlrmvm/cpu/TlrmvmCPU.cpp +++ b/src/tlrmvm/cpu/TlrmvmCPU.cpp @@ -1,4 +1,7 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include @@ -7,6 +10,8 @@ #include "../../common/AppUtil.hpp" #include "../../common/cpu/Util.hpp" #include +#include +#include #ifdef USE_MPI #include #endif @@ -124,7 +129,7 @@ void TlrmvmBase::InitData(){ RandomX(Datax, config.originN); this->xmat = Matrix(Datax, config.paddingN, 1); }else{ - char filename[200]; + char filename[300]; sprintf(filename, "%s/%s_Ubases_nb%d_acc%s.bin", config.datafolder.c_str(), config.problemname.c_str(),config.nb, config.acc.c_str()); size_t elems = config.granksum * config.nb; @@ -187,10 +192,10 @@ void TlrmvmBase::Phase1GetMembuffer(){ template void TlrmvmBase::Phase1GetMembufferTranspose() { - GetHostMemory(&p1transptrs.Abp, config.Ntg); - GetHostMemory(&p1transptrs.xbp, config.Ntg); - GetHostMemory(&p1transptrs.ybp, config.Ntg); - for(int i=0; i::Phase1GetMembufferTranspose() { p1transptrs.Acnt = 0; p1transptrs.Xcnt = 0; p1transptrs.Ycnt = 0; - for(int i=0; i::Phase2PrepareTranspose() { template void TlrmvmBase::Phase3GetMembuffer(){ - GetHostMemory(&p3ptrs.Abp, config.Ntg); - GetHostMemory(&p3ptrs.xbp, config.Ntg); - GetHostMemory(&p3ptrs.ybp, config.Ntg); - for(int i=0; i::Phase3GetMembuffer(){ p3ptrs.Acnt = 0; p3ptrs.Xcnt = 0; p3ptrs.Ycnt = 0; - for(int i=0; i #endif diff --git a/src/tlrmvm/cpu/TlrmvmDPCPP.hpp b/src/tlrmvm/cpu/TlrmvmDPCPP.hpp index d7e9e41..d218839 100644 --- a/src/tlrmvm/cpu/TlrmvmDPCPP.hpp +++ b/src/tlrmvm/cpu/TlrmvmDPCPP.hpp @@ -1,4 +1,7 @@ -#pragma once +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +#pragma once #include "TlrmvmCPU.hpp" #include diff --git a/src/tlrmvm/cuda/BatchTlrmvmcuda.cpp b/src/tlrmvm/cuda/BatchTlrmvmcuda.cpp index f8a2ada..f63ee55 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcuda.cpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcuda.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). 
+// All rights reserved. + // // Created by Yuxi Hong on 02/03/2022. // diff --git a/src/tlrmvm/cuda/BatchTlrmvmcuda.hpp b/src/tlrmvm/cuda/BatchTlrmvmcuda.hpp index 21cd926..0dff1a7 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcuda.hpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcuda.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include diff --git a/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.cpp b/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.cpp index 597d4ea..78cb430 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.cpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 02/03/2022. // diff --git a/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.hpp b/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.hpp index 9428201..c5f8ec6 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.hpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcudaFP16.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include diff --git a/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.cpp b/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.cpp index 7b72a24..89f23e6 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.cpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 02/03/2022. // diff --git a/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.hpp b/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.hpp index 4298e4d..af6858e 100644 --- a/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.hpp +++ b/src/tlrmvm/cuda/BatchTlrmvmcudaINT8.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include diff --git a/src/tlrmvm/cuda/TlrmvmMPfp16.cpp b/src/tlrmvm/cuda/TlrmvmMPfp16.cpp index 061628b..5182918 100644 --- a/src/tlrmvm/cuda/TlrmvmMPfp16.cpp +++ b/src/tlrmvm/cuda/TlrmvmMPfp16.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include "Tlrmvmcuda.hpp" #include "TlrmvmMPfp16.hpp" #include "cudakernel.cuh" diff --git a/src/tlrmvm/cuda/TlrmvmMPfp16.hpp b/src/tlrmvm/cuda/TlrmvmMPfp16.hpp index 0fd0fd3..8e67533 100644 --- a/src/tlrmvm/cuda/TlrmvmMPfp16.hpp +++ b/src/tlrmvm/cuda/TlrmvmMPfp16.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include #include diff --git a/src/tlrmvm/cuda/TlrmvmMPint8.cpp b/src/tlrmvm/cuda/TlrmvmMPint8.cpp index 0985da2..a1e7d79 100644 --- a/src/tlrmvm/cuda/TlrmvmMPint8.cpp +++ b/src/tlrmvm/cuda/TlrmvmMPint8.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include "Tlrmvmcuda.hpp" #include "TlrmvmMPint8.hpp" #include "cudakernel.cuh" diff --git a/src/tlrmvm/cuda/TlrmvmMPint8.hpp b/src/tlrmvm/cuda/TlrmvmMPint8.hpp index 8e6173a..9b714d4 100644 --- a/src/tlrmvm/cuda/TlrmvmMPint8.hpp +++ b/src/tlrmvm/cuda/TlrmvmMPint8.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
+ #pragma once #include #include diff --git a/src/tlrmvm/cuda/Tlrmvmcuda.cpp b/src/tlrmvm/cuda/Tlrmvmcuda.cpp index 803873a..f1f7ffc 100644 --- a/src/tlrmvm/cuda/Tlrmvmcuda.cpp +++ b/src/tlrmvm/cuda/Tlrmvmcuda.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include // if you need CUBLAS v2, include before magma.h diff --git a/src/tlrmvm/cuda/Tlrmvmcuda.hpp b/src/tlrmvm/cuda/Tlrmvmcuda.hpp index ca61de1..426cfd8 100644 --- a/src/tlrmvm/cuda/Tlrmvmcuda.hpp +++ b/src/tlrmvm/cuda/Tlrmvmcuda.hpp @@ -1,4 +1,7 @@ -#pragma once +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +#pragma once #include "../../common/Common.hpp" #include "../cpu/TlrmvmCPU.hpp" diff --git a/src/tlrmvm/cuda/TlrmvmcudaConstRank.cpp b/src/tlrmvm/cuda/TlrmvmcudaConstRank.cpp index c10425c..096da62 100644 --- a/src/tlrmvm/cuda/TlrmvmcudaConstRank.cpp +++ b/src/tlrmvm/cuda/TlrmvmcudaConstRank.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include // if you need CUBLAS v2, include before magma.h diff --git a/src/tlrmvm/cuda/TlrmvmcudaConstRank.hpp b/src/tlrmvm/cuda/TlrmvmcudaConstRank.hpp index 15809a2..aa6cfdc 100644 --- a/src/tlrmvm/cuda/TlrmvmcudaConstRank.hpp +++ b/src/tlrmvm/cuda/TlrmvmcudaConstRank.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include "Tlrmvmcuda.hpp" diff --git a/src/tlrmvm/cuda/cudakernel.cu b/src/tlrmvm/cuda/cudakernel.cu index 8f7ae1a..306929b 100644 --- a/src/tlrmvm/cuda/cudakernel.cu +++ b/src/tlrmvm/cuda/cudakernel.cu @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/src/tlrmvm/cuda/cudakernel.cuh b/src/tlrmvm/cuda/cudakernel.cuh index 58d5cb8..02873ef 100644 --- a/src/tlrmvm/cuda/cudakernel.cuh +++ b/src/tlrmvm/cuda/cudakernel.cuh @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #ifndef TLRMVMKERNEL_CUH #define TLRMVMKERNEL_CUH diff --git a/src/tlrmvm/cuda/tlrmvmcudautil.cpp b/src/tlrmvm/cuda/tlrmvmcudautil.cpp index 6ab97eb..9d47545 100644 --- a/src/tlrmvm/cuda/tlrmvmcudautil.cpp +++ b/src/tlrmvm/cuda/tlrmvmcudautil.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + // // Created by Yuxi Hong on 25/03/2022. // diff --git a/src/tlrmvm/cuda/tlrmvmcudautil.hpp b/src/tlrmvm/cuda/tlrmvmcudautil.hpp index ae81796..4ce5d6b 100644 --- a/src/tlrmvm/cuda/tlrmvmcudautil.hpp +++ b/src/tlrmvm/cuda/tlrmvmcudautil.hpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #pragma once #include diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.cpp b/src/tlrmvm/hip/BatchTlrmvmhip.cpp new file mode 100644 index 0000000..9b7bf6f --- /dev/null +++ b/src/tlrmvm/hip/BatchTlrmvmhip.cpp @@ -0,0 +1,715 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +// +// Created by Yuxi Hong on 02/03/2022. 
+// + +#include "../../common/Common.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include +#include +#include +#include "BatchTlrmvmhip.hpp" +#include "hipkernel.cuh" +#include + +namespace hiptlrmvm +{ + template + BatchTlrmvmhip::BatchTlrmvmhip(vector tlrmvmconfigvec) + :config_vec(tlrmvmconfigvec),batchsize(tlrmvmconfigvec.size()) + { + cout << "calling Batch Tlrmvmcuda" << endl; +#ifdef USE_MPI + int initflag; + MPI_Initialized(&initflag); + if(initflag == 1){ + int rank; + int size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&size); + if(rank == 0) + cout << "we are in mpi environment:" << endl; + int totaldevcount = 0; + HIPCHECK(hipGetDeviceCount(&totaldevcount)); + if(totaldevcount < size){ + if(rank == 0) + cout << "not enough cards, in debug mode, set all to 0." << endl; + HIPCHECK(hipSetDevice(0)); + }else{ + if(rank == 0) + cout << "we have enough cards, set to different cards." << endl; + HIPCHECK(hipSetDevice(rank%8)); + } + } + +#endif + cpuinstvec.resize(tlrmvmconfigvec.size()); + for(int i=0; i>(tlrmvmconfigvec[i])); + init_alpha_beta(alpha, beta); + finalresults.resize(tlrmvmconfigvec.size() * tlrmvmconfigvec[0].originM); + } + + template + BatchTlrmvmhip::BatchTlrmvmhip(){} + template + void BatchTlrmvmhip::StreamInit(int streamsize){ + this->stream_size = streamsize; + streamptr = new hipStream_t[streamsize]; + cublashandleptr = new hipblasHandle_t[streamsize]; + for(int i=0; i + void BatchTlrmvmhip::MemoryInit(){ + cudap1ptrs_vec.resize(batchsize); + cudap3ptrs_vec.resize(batchsize); + cudap1transptrs_vec.resize(batchsize); + cudap3transptrs_vec.resize(batchsize); + auto start = std::chrono::steady_clock::now(); +#pragma omp parallel for default(none) + for(int bi=0; biMemoryInit(); + PhasePointersCopyNonPointers + (cudap1ptrs_vec[bi],cpuinstvec[bi]->p1ptrs); + PhasePointersCopyNonPointers + (cudap3ptrs_vec[bi],cpuinstvec[bi]->p3ptrs); + PhasePointersCopyNonPointers( + cudap1transptrs_vec[bi],cpuinstvec[bi]->p1transptrs); + PhasePointersCopyNonPointers( + cudap3transptrs_vec[bi],cpuinstvec[bi]->p3transptrs); + } + auto end = std::chrono::steady_clock::now(); + auto elapse_time = std::chrono::duration_cast(end - start).count(); +#ifdef USE_MPI + int initflag; + MPI_Initialized(&initflag); + if(initflag == 1){ + int rank; + int size; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&size); + MPI_Barrier(MPI_COMM_WORLD); + auto recv_buffer = elapse_time; + MPI_Allreduce(&elapse_time, &recv_buffer, 1, + MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD); + if(rank == 0) cout << "Reading data buffer takes time is " << recv_buffer * 1e-6 << " seconds."<< endl; + } +#else + cout << "Reading data buffer takes time is " << elapse_time * 1e-6 << endl; +#endif + Phase1Prepare(); + Phase2Prepare(); + Phase3Prepare(); + // transpose + Phase1PrepareTranspose(); + Phase2PrepareTranspose(); + Phase3PrepareTranspose(); + } + + template + void BatchTlrmvmhip::SetTransposeConjugate(bool transpose, bool conjugate){ + this->transpose = transpose; + this->conjugate = conjugate; + for(auto &x : cpuinstvec) x->SetTransposeConjugate(transpose, conjugate); + } + + + template + void BatchTlrmvmhip::setX(HostType * xvector, size_t xlength){ + int offset = 0; + assert(xlength == config_vec[0].originN * batchsize); + for(int i=0; isetX(xvector + offset , config_vec[i].originN); + offset += config_vec[i].originN; + } + } + + template + void BatchTlrmvmhip::TryConjugateXvec() { + for(int bi=0; biTryConjugateXvec(); + CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, 
cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->xmat.Shape()[0]); + } + } + + template + void BatchTlrmvmhip::Phase1(){ + hipDeviceSynchronize(); + for(int bi=0; bi < batchsize; bi++){ + for(int i=0; i + void BatchTlrmvmhip::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi < batchsize; bi++){ + for(int i=0; i + void BatchTlrmvmhip::Phase1Prepare() { + + for(int bi=0; bip1ptrs.A, cpuinstvec[bi]->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->p1ptrs.Xcnt); + } + + } + + template + void BatchTlrmvmhip::Phase1PrepareTranspose() { + for(int bi=0; bi < batchsize; bi++){ + int curbatch = cudap1ptrs_vec[bi].Ms.size(); + GethipHostMemory(&cudap1transptrs_vec[bi].Abp, curbatch); + GethipHostMemory(&cudap1transptrs_vec[bi].xbp, curbatch); + GethipHostMemory(&cudap1transptrs_vec[bi].ybp, curbatch); + } + for(int bi=0; bi + void BatchTlrmvmhip::Phase2(){ + hipDeviceSynchronize(); + for(int bi=0; bi(cudap1ptrs_vec[bi].y, d_phase2mapping_vec[bi], + cudap3ptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[bi % stream_size]); + } + hipDeviceSynchronize(); + } + template + void BatchTlrmvmhip::Phase2Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi(cudap1transptrs_vec[bi].y, d_phase2mapping_transpose_vec[bi], + cudap3transptrs_vec[bi].x, config_vec[bi].granksum, + streamptr[bi%stream_size]); + } + hipDeviceSynchronize(); + } + + template + void BatchTlrmvmhip::Phase2Prepare(){ + d_phase2mapping_vec = new size_t*[batchsize]; + for(int bi=0; bih_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping_vec[bi], cpuinstvec[bi]->h_phase2mapping.data(), + cpuinstvec[bi]->h_phase2mapping.size()); + } + } + template + void BatchTlrmvmhip::Phase2PrepareTranspose(){ + d_phase2mapping_transpose_vec = new size_t*[batchsize]; + for(int bi=0; bih_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose_vec[bi], cpuinstvec[bi]->h_phase2mappingTranspose.data(), + cpuinstvec[bi]->h_phase2mappingTranspose.size()); + } + } + + template + void BatchTlrmvmhip::Phase3(){ + hipDeviceSynchronize(); + for(int bi=0; bi + void BatchTlrmvmhip::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi + void BatchTlrmvmhip::Phase3Prepare() { + for(int bi=0; biconfig.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs_vec[bi].Abp[i] = cudap3ptrs_vec[bi].Abp[i-1] + AuMK; + cudap3ptrs_vec[bi].xbp[i] = cudap3ptrs_vec[bi].xbp[i-1] + AuKN; + cudap3ptrs_vec[bi].ybp[i] = cudap3ptrs_vec[bi].ybp[i-1] + AuMN; + } + // load phase 3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs_vec[bi].A, cpuinstvec[bi]->p3ptrs.A, cudap3ptrs_vec[bi].Acnt); + } + } + template + void BatchTlrmvmhip::Phase3PrepareTranspose() { + for(int bi=0; bi + void BatchTlrmvmhip::MVM_MultiGraph(){ + if(transpose){ + MVM_MultiGraphTranspose(); + }else{ + MVM_MultiGraphNoTranspose(); + } + } + + template + void BatchTlrmvmhip::MVM_MultiGraphTranspose(){ + auto & graphCreated = transposemultigraph.graphCreated; + auto & event_start = transposemultigraph.event_start; + auto & events = transposemultigraph.events; + auto & graph = transposemultigraph.graph; + auto & instance = transposemultigraph.instance; + for(int bi=0; bi(cudap1transptrs_vec[bi].y, + d_phase2mapping_transpose_vec[bi], + cudap3transptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[0]); + hipEventRecord(events[bi][0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], 
events[bi][0],0); + } + // phase 3 transpose + for(int i=0; i + void BatchTlrmvmhip::MVM_MultiGraphNoTranspose() + { + auto & graphCreated = multigraph.graphCreated; + auto & event_start = multigraph.event_start; + auto & events = multigraph.events; + auto & graph = multigraph.graph; + auto & instance = multigraph.instance; + for(int bi=0; bi(cudap1ptrs_vec[bi].y, + d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[0]); + hipEventRecord(events[bi][0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], events[bi][0],0); + } + // phase 3 + for(int i=0; i + void BatchTlrmvmhip::MVM_SingleGraph() + { + if(transpose){ + MVM_MultiGraphTranspose(); + }else{ + MVM_MultiGraphNoTranspose(); + } + } + + template + void BatchTlrmvmhip::MVM_SingleGraphTranspose() + { + auto & graphCreated = transposesinglegraph.graphCreated; + auto & event_start = transposesinglegraph.event_start; + auto & events = transposesinglegraph.events; + auto & graph = transposesinglegraph.graph; + auto & instance = transposesinglegraph.instance; + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + transposesinglegraph.syncotherstreams(event_start, streamptr, stream_size); + for(int bi=0; bi(cudap1transptrs_vec[bi].y, + d_phase2mapping_transpose_vec[bi], + cudap3transptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[0]); + } + // phase 2 synchronization + transposesinglegraph.syncallstreams(events+1*stream_size, streamptr, stream_size); + for(int bi=0; bi + void BatchTlrmvmhip::MVM_SingleGraphNoTranspose() + { + auto & graphCreated = singlegraph.graphCreated; + auto & event_start = singlegraph.event_start; + auto & events = singlegraph.events; + auto & graph = singlegraph.graph; + auto & instance = singlegraph.instance; + + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + singlegraph.syncotherstreams(event_start, streamptr, stream_size); + // phase 1 + for(int bi=0; bi(cudap1ptrs_vec[bi].y, + d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[bi%stream_size]); + } + // phase 2 synchronization + singlegraph.syncallstreams(events+1*stream_size, streamptr, stream_size); + // phase 3 + for(int bi=0; bi + void BatchTlrmvmhip::TryConjugateResults() { +// if(!conjugate) return; + if(transpose){ + for(int bi=0; bi(cudap3transptrs_vec[bi].y,config_vec[bi].originN, streamptr[0]); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y,cpuinstvec[bi]->config.originM); + } + }else{ + for(int bi=0; bi(cudap3ptrs_vec[bi].y,config_vec[bi].originM,streamptr[0]); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y,cpuinstvec[bi]->config.originM); + } + } + } + + + template + void BatchTlrmvmhip::CopyBackResults() + { + size_t offset = 0, origin = 0; + for(int bi=0; bip1transptrs.y, (HostType*)cudap1transptrs_vec[bi].y, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.x, (HostType*)cudap3transptrs_vec[bi].x, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y, cpuinstvec[bi]->config.originM); + origin = cpuinstvec[bi]->config.originM; +// memcpy(cpuinstvec[bi]->p3transptrs.y, &alpha, sizeof(HostType)); + memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3transptrs.y, sizeof(HostType) * origin); + offset += cpuinstvec[bi]->config.originM; + }else{ + CopyDataB2HD(cpuinstvec[bi]->p1ptrs.y, 
(HostType*)cudap1ptrs_vec[bi].y, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.x, (HostType*)cudap3ptrs_vec[bi].x, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y, cpuinstvec[bi]->config.originM); + origin = cpuinstvec[bi]->config.originM; + memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3ptrs.y, sizeof(HostType) * origin); + offset += cpuinstvec[bi]->config.originM; + } + cpuinstvec[bi]->CopyToFinalresults(); + } + } + + template + void BatchTlrmvmhip::MemoryFree(){ + for(int bi=0; biMemoryFree(); + FreehipHostMemory(cudap1ptrs_vec[bi].Abp); + FreehipHostMemory(cudap1ptrs_vec[bi].xbp); + FreehipHostMemory(cudap1ptrs_vec[bi].ybp); + FreeDeviceMemory(cudap1ptrs_vec[bi].A); + FreeDeviceMemory(cudap1ptrs_vec[bi].x); + FreeDeviceMemory(cudap1ptrs_vec[bi].y); + + FreehipHostMemory(cudap3ptrs_vec[bi].Abp); + FreehipHostMemory(cudap3ptrs_vec[bi].xbp); + FreehipHostMemory(cudap3ptrs_vec[bi].ybp); + FreeDeviceMemory(cudap3ptrs_vec[bi].A); + FreeDeviceMemory(cudap3ptrs_vec[bi].x); + FreeDeviceMemory(cudap3ptrs_vec[bi].y); + + FreehipHostMemory(cudap1transptrs_vec[bi].Abp); + FreehipHostMemory(cudap1transptrs_vec[bi].xbp); + FreehipHostMemory(cudap1transptrs_vec[bi].ybp); + FreeDeviceMemory(cudap1transptrs_vec[bi].y); + + FreehipHostMemory(cudap3transptrs_vec[bi].Abp); + FreehipHostMemory(cudap3transptrs_vec[bi].xbp); + FreehipHostMemory(cudap3transptrs_vec[bi].ybp); + FreeDeviceMemory(cudap3transptrs_vec[bi].y); + } + } + + template class BatchTlrmvmhip; + template class BatchTlrmvmhip; + template class BatchTlrmvmhip, hipComplex>; + template class BatchTlrmvmhip, hipDoubleComplex>; + + +} \ No newline at end of file diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.hpp b/src/tlrmvm/hip/BatchTlrmvmhip.hpp new file mode 100644 index 0000000..106155c --- /dev/null +++ b/src/tlrmvm/hip/BatchTlrmvmhip.hpp @@ -0,0 +1,87 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +#pragma once + +#include +using std::vector; + + +#include "../cpu/TlrmvmCPU.hpp" +#include "Tlrmvmhip.hpp" +#include "tlrmvmhiputil.hpp" +#include + +#ifdef USE_MPI +#include +#endif + +namespace hiptlrmvm +{ + template + class BatchTlrmvmhip + { + public: + explicit BatchTlrmvmhip(vector tlrmvmconfigvec); + BatchTlrmvmhip(); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1Prepare(); + void Phase1PrepareTranspose(); + void Phase2(); + void Phase2Transpose(); + void Phase2Prepare(); + void Phase2PrepareTranspose(); + void Phase3(); + void Phase3Transpose(); + void Phase3Prepare(); + void Phase3PrepareTranspose(); + void MVM_SingleGraph(); + void MVM_SingleGraphTranspose(); + void MVM_SingleGraphNoTranspose(); + void MVM_MultiGraph(); + void MVM_MultiGraphTranspose(); + void MVM_MultiGraphNoTranspose(); + + // seperate 2 functions. 
+ void SetTransposeConjugate(bool transpose, bool conjugate); + void setX(HostType * xvector, size_t xlength); + void TryConjugateXvec(); + void TryConjugateResults(); + void CopyBackResults(); + + bool transpose; + bool conjugate; + + int batchsize; + // cpu instance + vector config_vec; + vector>> cpuinstvec; + + // GPU resources + hipStream_t * streamptr; + hipblasHandle_t * cublashandleptr; + int stream_size; + + MultiGraph multigraph; + MultiGraph transposemultigraph; + SingleGraph singlegraph; + SingleGraph transposesinglegraph; + + DeviceType alpha; + DeviceType beta; + // gpu pointers + vector> cudap1ptrs_vec; + vector> cudap1transptrs_vec; + size_t * *d_phase2mapping_vec; + size_t * *d_phase2mapping_transpose_vec; + vector> cudap3ptrs_vec; + vector> cudap3transptrs_vec; + vector finalresults; + }; +} + diff --git a/src/tlrmvm/hip/Tlrmvmhip.cpp b/src/tlrmvm/hip/Tlrmvmhip.cpp new file mode 100644 index 0000000..eec29ac --- /dev/null +++ b/src/tlrmvm/hip/Tlrmvmhip.cpp @@ -0,0 +1,610 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +#include +#include +#include + +#include "../../common/Common.hpp" +#include "../../common/AppUtil.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "Tlrmvmhip.hpp" +#include "hipkernel.cuh" + +namespace hiptlrmvm +{ + + template + HIPPhasePointers::HIPPhasePointers(){} + + template struct HIPPhasePointers; + template struct HIPPhasePointers; + template struct HIPPhasePointers; + template struct HIPPhasePointers; + + template + void PhasePointersCopyNonPointers(HIPPhasePointers &dest, const PhasePointers &src){ + dest.Acnt = src.Acnt; + dest.Xcnt = src.Xcnt; + dest.Ycnt = src.Ycnt; + dest.Ms = src.Ms; + dest.Ks = src.Ks; + dest.Ns = src.Ns; + } + + template void PhasePointersCopyNonPointers(HIPPhasePointers &, + const PhasePointers&); + template void PhasePointersCopyNonPointers(HIPPhasePointers &, + const PhasePointers&); + template void PhasePointersCopyNonPointers,hipComplex> + (HIPPhasePointers &, const PhasePointers>&); + template void PhasePointersCopyNonPointers,hipDoubleComplex> + (HIPPhasePointers &, const PhasePointers>&); + + template + Tlrmvmhip::Tlrmvmhip() {} + + template + Tlrmvmhip::Tlrmvmhip(TlrmvmConfig tlrmvmconfig) + :config(tlrmvmconfig) + { + transpose = false; + conjugate = false; + init_alpha_beta(alpha, beta); + tlrmvmcpu = std::make_shared>(tlrmvmconfig); + } + + template + void Tlrmvmhip::UpdateConfig(TlrmvmConfig &tlrmvmconfig) + { +// transpose = false; +// conjugate = false; +// init_alpha_beta(alpha, beta); +// tlrmvmcpu->UpdateConfig(tlrmvmconfig); + cout << "UpdateConfig not implemented." 
<< endl; + exit(0); + } + + template + void Tlrmvmhip::setX(HostType * xvector, size_t xlength){ + tlrmvmcpu->setX(xvector, xlength); + tlrmvmcpu->TryConjugateXvec(); + } + + template + void Tlrmvmhip::TryConjugateXvec() { + // no transpose logic + tlrmvmcpu->TryConjugateXvec(); + CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]); + } + template + void Tlrmvmhip::TryConjugateResults() { + if(!conjugate) return; + if(transpose){ + ConjugateDriver(cudap3transptrs.y, config.originN, streamptr[0]); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + ConjugateDriver(cudap3ptrs.y, config.originM, streamptr[0]); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + } + + template + void Tlrmvmhip::StreamInit(int streamsize){ + this->stream_size = streamsize; + streamptr = new hipStream_t[streamsize]; + cublashandleptr = new hipblasHandle_t[streamsize]; + for(int i=0; i + void Tlrmvmhip::StreamDestroy(){ + for(int i=0; i + void Tlrmvmhip::MemoryInit(){ + tlrmvmcpu->MemoryInit(); + PhasePointersCopyNonPointers(cudap1ptrs, tlrmvmcpu->p1ptrs); + PhasePointersCopyNonPointers(cudap3ptrs, tlrmvmcpu->p3ptrs); + PhasePointersCopyNonPointers(cudap1transptrs, tlrmvmcpu->p1transptrs); + PhasePointersCopyNonPointers(cudap3transptrs, tlrmvmcpu->p3transptrs); + Phase1GetMembuffer(); + AllocatePhase1Buffer(); + Phase1CopyData(); + Phase2Prepare(); + Phase3GetMembuffer(); + AllocatePhase3Buffer(); + Phase3CopyData(); + // transpose + Phase1GetMembufferTranspose(); + AllocatePhase1BufferTranspose(); + Phase1CopyDataTranspose(); + Phase2PrepareTranspose(); + Phase3GetMembufferTranspose(); + AllocatePhase3BufferTranspose(); + Phase3CopyDataTranspose(); + } + + template + void Tlrmvmhip::MemoryFree(){ + tlrmvmcpu->MemoryFree(); + FreehipHostMemory(cudap1ptrs.Abp); + FreehipHostMemory(cudap1ptrs.xbp); + FreehipHostMemory(cudap1ptrs.ybp); + FreeDeviceMemory(cudap1ptrs.A); + FreeDeviceMemory(cudap1ptrs.x); + FreeDeviceMemory(cudap1ptrs.y); + + FreehipHostMemory(cudap3ptrs.Abp); + FreehipHostMemory(cudap3ptrs.xbp); + FreehipHostMemory(cudap3ptrs.ybp); + FreeDeviceMemory(cudap3ptrs.A); + FreeDeviceMemory(cudap3ptrs.x); + FreeDeviceMemory(cudap3ptrs.y); + + FreehipHostMemory(cudap1transptrs.Abp); + FreehipHostMemory(cudap1transptrs.xbp); + FreehipHostMemory(cudap1transptrs.ybp); + FreeDeviceMemory(cudap1transptrs.y); + + FreehipHostMemory(cudap3transptrs.Abp); + FreehipHostMemory(cudap3transptrs.xbp); + FreehipHostMemory(cudap3transptrs.ybp); + FreeDeviceMemory(cudap3transptrs.y); + } + + template + void Tlrmvmhip::Phase1(){ + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::Phase1GetMembuffer(){ + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1ptrs.Abp, batchsize); + GethipHostMemory(&cudap1ptrs.xbp, batchsize); + GethipHostMemory(&cudap1ptrs.ybp, batchsize); + } + template + void Tlrmvmhip::Phase1GetMembufferTranspose() + { + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1transptrs.Abp, batchsize); + GethipHostMemory(&cudap1transptrs.xbp, batchsize); + GethipHostMemory(&cudap1transptrs.ybp, batchsize); + } + + template + void Tlrmvmhip::AllocatePhase1Buffer(){ + GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt); + GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt); + GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt); + cudap1ptrs.Abp[0] = cudap1ptrs.A; + 
cudap1ptrs.xbp[0] = cudap1ptrs.x; + cudap1ptrs.ybp[0] = cudap1ptrs.y; + } + template + void Tlrmvmhip::AllocatePhase1BufferTranspose(){ + cudap1transptrs.A = cudap3ptrs.A; + cudap1transptrs.x = cudap1ptrs.x; + GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt); + cudap1transptrs.Abp[0] = cudap3ptrs.A; // use phase 3, U bases + cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x + cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer + } + + template + void Tlrmvmhip::Phase1CopyData(){ + auto AvMs = cudap1ptrs.Ms; + auto AvNs = cudap1ptrs.Ns; + auto AvKs = cudap1ptrs.Ks; + for(int i=1; ip1ptrs.A, tlrmvmcpu->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt); + } + template + void Tlrmvmhip::Phase1CopyDataTranspose(){ + for(int i=1; i + void Tlrmvmhip::Phase2(){ + hipDeviceSynchronize(); + phase2_nosplit(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipDeviceSynchronize(); + } + template + void Tlrmvmhip::Phase2Transpose(){ + hipDeviceSynchronize(); + phase2_nosplit(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipDeviceSynchronize(); + } + + template + void Tlrmvmhip::Phase2Prepare(){ + GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size()); + } + template + void Tlrmvmhip::Phase2PrepareTranspose(){ + GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(), + tlrmvmcpu->h_phase2mappingTranspose.size()); + } + + template + void Tlrmvmhip::Phase3(){ + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::Phase3GetMembuffer(){ + int batchsize = cudap3ptrs.Ms.size(); + GethipHostMemory(&cudap3ptrs.Abp, batchsize); + GethipHostMemory(&cudap3ptrs.xbp, batchsize); + GethipHostMemory(&cudap3ptrs.ybp, batchsize); + } + template + void Tlrmvmhip::Phase3GetMembufferTranspose(){ + int batchsize = cudap3transptrs.Ms.size(); + GethipHostMemory(&cudap3transptrs.Abp, batchsize); + GethipHostMemory(&cudap3transptrs.xbp, batchsize); + GethipHostMemory(&cudap3transptrs.ybp, batchsize); + } + + template + void Tlrmvmhip::AllocatePhase3Buffer(){ + GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt); + GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt); + GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt); + cudap3ptrs.Abp[0] = cudap3ptrs.A; + cudap3ptrs.xbp[0] = cudap3ptrs.x; + cudap3ptrs.ybp[0] = cudap3ptrs.y; + } + template + void Tlrmvmhip::AllocatePhase3BufferTranspose(){ + cudap3transptrs.A = cudap1ptrs.A; + cudap3transptrs.x = cudap3ptrs.x; + GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt); + cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases + cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x + cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer + } + + template + void Tlrmvmhip::Phase3CopyData(){ + auto AuMs = cudap3ptrs.Ms; + auto AuNs = cudap3ptrs.Ns; + auto AuKs = cudap3ptrs.Ks; + for(int i=1; iconfig.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK; + cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN; + cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN; + } + // load phase 
3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt); + } + template + void Tlrmvmhip::Phase3CopyDataTranspose(){ + for(int i=1; i + void Tlrmvmhip::MVMGraphTranspose() { + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start, streamptr[0]); + for(int streami=1; streami(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipEventRecord(events[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], events[0],0); + } + // phase 3 transpose + for(int i=0; i + void Tlrmvmhip::MVMGraphNoTranspose() { + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start, streamptr[0]); + for(int streami=1; streami(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipEventRecord(events[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], events[0],0); + } + // phase 3 + for(int i=0; i + void Tlrmvmhip::MVMGraph(){ + if(transpose){ + MVMGraphTranspose(); + }else{ + MVMGraphNoTranspose(); + } + } + + template + void Tlrmvmhip::MVMTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::MVMNoTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipDeviceSynchronize(); + for(int i=0; i + void Tlrmvmhip::CopyBackResults() + { + // use cpu pointers to send output + if(transpose){ + CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + tlrmvmcpu->CopyToFinalresults(); + } + + template + void Tlrmvmhip::MVM() { + if(transpose){ + MVMTranspose(); + }else{ + MVMNoTranspose(); + } + } + + template + void Tlrmvmhip::SetTransposeConjugate(bool transpose, bool conjugate) { + this->transpose = transpose; + this->conjugate = conjugate; + tlrmvmcpu->SetTransposeConjugate(transpose, conjugate); + } + + + + template class Tlrmvmhip; + template class Tlrmvmhip; + template class Tlrmvmhip, hipComplex>; + template class Tlrmvmhip, hipDoubleComplex>; + +} // namespace cudatlrmvm + diff --git a/src/tlrmvm/hip/Tlrmvmhip.hpp b/src/tlrmvm/hip/Tlrmvmhip.hpp new file mode 100644 index 0000000..33f2c04 --- /dev/null +++ b/src/tlrmvm/hip/Tlrmvmhip.hpp @@ -0,0 +1,117 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
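+
+// A minimal usage sketch of the class declared below (illustrative only:
+// variable names and the stream count are placeholders; the TlrmvmConfig
+// constructor arguments follow the CPU test drivers):
+//
+//   TlrmvmConfig cfg(originM, originN, nb, datafolder, acc, problemname);
+//   Tlrmvmhip<complex<float>, hipComplex> mvm(cfg);
+//   mvm.StreamInit(4);                       // HIP streams + hipBLAS handles
+//   mvm.MemoryInit();                        // host init, then phase 1/2/3 device buffers
+//   mvm.SetTransposeConjugate(false, false);
+//   mvm.setX(xvec, cfg.originN);             // stage the input vector
+//   mvm.MVM();                               // phase 1 GEMVs -> phase 2 scatter -> phase 3 GEMVs
+//   mvm.CopyBackResults();                   // final y returned through tlrmvmcpu->p3ptrs.y
+//   mvm.MemoryFree();
+//   mvm.StreamDestroy();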
+ +#pragma once + +#include "../../common/Common.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "../../common/hip/Util.hpp" +#include +#include +#include +#include +#include + +namespace hiptlrmvm +{ + template + struct HIPPhasePointers{ + HIPPhasePointers(); + size_t Acnt; + size_t Xcnt; + size_t Ycnt; + vector Ms; + vector Ks; + vector Ns; + T *A; + T *x; + T *y; + T **Abp; + T **xbp; + T **ybp; + }; + + template + void PhasePointersCopyNonPointers(HIPPhasePointers &dest, const PhasePointers &src); + + // Tlrmvm cuda is only responsible for cuda memory ops. + // Any host memory related ops should go to CPU instance. + template + class Tlrmvmhip + { + public: + explicit Tlrmvmhip(TlrmvmConfig tlrmvmconfig); + Tlrmvmhip(); + void UpdateConfig(TlrmvmConfig &tlrmvmConfig); + void SetTransposeConjugate(bool transpose, bool conjugate); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1GetMembuffer(); + void AllocatePhase1Buffer(); + void Phase1CopyData(); + void Phase1GetMembufferTranspose(); + void AllocatePhase1BufferTranspose(); + void Phase1CopyDataTranspose(); + void Phase2(); + void Phase2Transpose(); + void Phase2Prepare(); + void Phase2PrepareTranspose(); + void Phase3(); + void Phase3Transpose(); + void Phase3GetMembuffer(); + void AllocatePhase3Buffer(); + void Phase3CopyData(); + void Phase3GetMembufferTranspose(); + void AllocatePhase3BufferTranspose(); + void Phase3CopyDataTranspose(); + void MVM(); + void MVMTranspose(); + void MVMNoTranspose(); + void MVMGraph(); + void MVMGraphTranspose(); + void MVMGraphNoTranspose(); + void setX(HostType * xvector, size_t xlength); + // seperate 2 functions. + void TryConjugateXvec(); + void TryConjugateResults(); + void CopyBackResults(); + + bool transpose; + bool conjugate; + // cpu instance + TlrmvmConfig config; + shared_ptr> tlrmvmcpu; + // GPU resources + hipStream_t * streamptr; + hipblasHandle_t * cublashandleptr; + int stream_size; + hipGraph_t graph; + bool graphCreated; + hipGraphExec_t instance; + hipEvent_t *events; + hipEvent_t event_start; + hipEvent_t event_phase2finish; + + DeviceType alpha; + DeviceType beta; + + // gpu pointers + HIPPhasePointers cudap1ptrs; + HIPPhasePointers cudap1transptrs; + size_t *d_phase2mapping; + size_t *d_phase2mapping_transpose; + HIPPhasePointers cudap3ptrs; + HIPPhasePointers cudap3transptrs; + }; + + + + +} // + + + diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.cpp b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp new file mode 100644 index 0000000..e7ac72b --- /dev/null +++ b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp @@ -0,0 +1,487 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
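+
+// Note: this const-rank variant assumes all tile blocks share identical
+// dimensions, so on the non-transpose path phase 1 and phase 3 are each
+// issued as one hipblasgemmbatched call over the device pointer arrays
+// (d_p1{A,x,y}ptrs, d_p3{A,x,y}ptrs) built in MemoryInit, rather than one
+// hipblasgemv per tile; a single stream and hipBLAS handle serve all phases.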
+ +#include +#include +#include + +#include "../../common/Common.hpp" +#include "../../common/AppUtil.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "Tlrmvmhip.hpp" +#include "TlrmvmhipConstRank.hpp" +#include "hipkernel.cuh" + +namespace hiptlrmvm +{ + template + TlrmvmhipConstRank::TlrmvmhipConstRank() {} + + template + TlrmvmhipConstRank::TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig) + :config(tlrmvmconfig) + { + transpose = false; + conjugate = false; + init_alpha_beta(alpha, beta); + tlrmvmcpu = std::make_shared>(tlrmvmconfig); + } + + template + void TlrmvmhipConstRank::UpdateConfig(TlrmvmConfig &tlrmvmconfig) + { +// transpose = false; +// conjugate = false; +// init_alpha_beta(alpha, beta); +// tlrmvmcpu->UpdateConfig(tlrmvmconfig); + cout << "UpdateConfig not implemented." << endl; + exit(0); + } + + template + void TlrmvmhipConstRank::setX(HostType * xvector, size_t xlength){ + tlrmvmcpu->setX(xvector, xlength); + tlrmvmcpu->TryConjugateXvec(); + } + + template + void TlrmvmhipConstRank::TryConjugateXvec() { + // no transpose logic + tlrmvmcpu->TryConjugateXvec(); + CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]); + } + template + void TlrmvmhipConstRank::TryConjugateResults() { + if(!conjugate) return; + if(transpose){ + ConjugateDriver(cudap3transptrs.y, config.originN, stream); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + ConjugateDriver(cudap3ptrs.y, config.originM, stream); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + } + + template + void TlrmvmhipConstRank::StreamInit(int streamsize){ + hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + hipblasCreate(&cublashandle); + hipblasSetStream(cublashandle, stream); + } + + template + void TlrmvmhipConstRank::StreamDestroy(){ + hipblasDestroy(cublashandle); + hipStreamDestroy(stream); + } + + template + void TlrmvmhipConstRank::MemoryInit(){ + tlrmvmcpu->MemoryInit(); + PhasePointersCopyNonPointers(cudap1ptrs, tlrmvmcpu->p1ptrs); + PhasePointersCopyNonPointers(cudap3ptrs, tlrmvmcpu->p3ptrs); + PhasePointersCopyNonPointers(cudap1transptrs, tlrmvmcpu->p1transptrs); + PhasePointersCopyNonPointers(cudap3transptrs, tlrmvmcpu->p3transptrs); + Phase1GetMembuffer(); + AllocatePhase1Buffer(); + Phase1CopyData(); + Phase2Prepare(); + Phase3GetMembuffer(); + AllocatePhase3Buffer(); + Phase3CopyData(); + // transpose + Phase1GetMembufferTranspose(); + AllocatePhase1BufferTranspose(); + Phase1CopyDataTranspose(); + Phase2PrepareTranspose(); + Phase3GetMembufferTranspose(); + AllocatePhase3BufferTranspose(); + Phase3CopyDataTranspose(); + + // init batch pointers + GetDeviceMemory(&d_p1Aptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p1xptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p1yptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p3Aptrs, cudap3ptrs.Ms.size()); + GetDeviceMemory(&d_p3xptrs, cudap3ptrs.Ms.size()); + GetDeviceMemory(&d_p3yptrs, cudap3ptrs.Ms.size()); + + CopyDataB2HD((HostType**)d_p1Aptrs, (HostType**)cudap1ptrs.Abp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1xptrs, (HostType**)cudap1ptrs.xbp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1yptrs, (HostType**)cudap1ptrs.ybp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3Aptrs, (HostType**)cudap3ptrs.Abp, cudap3ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3xptrs, (HostType**)cudap3ptrs.xbp, cudap3ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3yptrs, (HostType**)cudap3ptrs.ybp, 
cudap3ptrs.Ms.size()); + + GetDeviceMemory(&d_p1transAptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p1transxptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p1transyptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p3transAptrs, cudap3transptrs.Ms.size()); + GetDeviceMemory(&d_p3transxptrs, cudap3transptrs.Ms.size()); + GetDeviceMemory(&d_p3transyptrs, cudap3transptrs.Ms.size()); + + CopyDataB2HD((HostType**)d_p1transAptrs, (HostType**)cudap1transptrs.Abp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1transxptrs, (HostType**)cudap1transptrs.xbp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1transyptrs, (HostType**)cudap1transptrs.ybp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transAptrs, (HostType**)cudap3transptrs.Abp, cudap3transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transxptrs, (HostType**)cudap3transptrs.xbp, cudap3transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transyptrs, (HostType**)cudap3transptrs.ybp, cudap3transptrs.Ms.size()); + } + + template + void TlrmvmhipConstRank::MemoryFree(){ + tlrmvmcpu->MemoryFree(); + FreehipHostMemory(cudap1ptrs.Abp); + FreehipHostMemory(cudap1ptrs.xbp); + FreehipHostMemory(cudap1ptrs.ybp); + FreeDeviceMemory(cudap1ptrs.A); + FreeDeviceMemory(cudap1ptrs.x); + FreeDeviceMemory(cudap1ptrs.y); + + FreehipHostMemory(cudap3ptrs.Abp); + FreehipHostMemory(cudap3ptrs.xbp); + FreehipHostMemory(cudap3ptrs.ybp); + FreeDeviceMemory(cudap3ptrs.A); + FreeDeviceMemory(cudap3ptrs.x); + FreeDeviceMemory(cudap3ptrs.y); + + FreehipHostMemory(cudap1transptrs.Abp); + FreehipHostMemory(cudap1transptrs.xbp); + FreehipHostMemory(cudap1transptrs.ybp); + FreeDeviceMemory(cudap1transptrs.y); + + FreehipHostMemory(cudap3transptrs.Abp); + FreehipHostMemory(cudap3transptrs.xbp); + FreehipHostMemory(cudap3transptrs.ybp); + FreeDeviceMemory(cudap3transptrs.y); + + FreeDeviceMemory(d_p1Aptrs); + FreeDeviceMemory(d_p1xptrs); + FreeDeviceMemory(d_p1yptrs); + FreeDeviceMemory(d_p3Aptrs); + FreeDeviceMemory(d_p3xptrs); + FreeDeviceMemory(d_p3yptrs); + + FreeDeviceMemory(d_p1transAptrs); + FreeDeviceMemory(d_p1transxptrs); + FreeDeviceMemory(d_p1transyptrs); + FreeDeviceMemory(d_p3transAptrs); + FreeDeviceMemory(d_p3transxptrs); + FreeDeviceMemory(d_p3transyptrs); + } + + template + void TlrmvmhipConstRank::Phase1(){ + hipDeviceSynchronize(); + for(int i=0; i + void TlrmvmhipConstRank::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i + void TlrmvmhipConstRank::Phase1GetMembuffer(){ + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1ptrs.Abp, batchsize); + GethipHostMemory(&cudap1ptrs.xbp, batchsize); + GethipHostMemory(&cudap1ptrs.ybp, batchsize); + } + template + void TlrmvmhipConstRank::Phase1GetMembufferTranspose() + { + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1transptrs.Abp, batchsize); + GethipHostMemory(&cudap1transptrs.xbp, batchsize); + GethipHostMemory(&cudap1transptrs.ybp, batchsize); + } + + template + void TlrmvmhipConstRank::AllocatePhase1Buffer(){ + GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt); + GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt); + GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt); + cudap1ptrs.Abp[0] = cudap1ptrs.A; + cudap1ptrs.xbp[0] = cudap1ptrs.x; + cudap1ptrs.ybp[0] = cudap1ptrs.y; + } + template + void TlrmvmhipConstRank::AllocatePhase1BufferTranspose(){ + cudap1transptrs.A = cudap3ptrs.A; + cudap1transptrs.x = cudap1ptrs.x; + GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt); + cudap1transptrs.Abp[0] = cudap3ptrs.A; 
// use phase 3, U bases + cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x + cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer + } + + template + void TlrmvmhipConstRank::Phase1CopyData(){ + auto AvMs = cudap1ptrs.Ms; + auto AvNs = cudap1ptrs.Ns; + auto AvKs = cudap1ptrs.Ks; + for(int i=1; ip1ptrs.A, tlrmvmcpu->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt); + } + template + void TlrmvmhipConstRank::Phase1CopyDataTranspose(){ + for(int i=1; i + void TlrmvmhipConstRank::Phase2(){ + hipDeviceSynchronize(); + phase2_nosplit(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, stream); + hipDeviceSynchronize(); + } + template + void TlrmvmhipConstRank::Phase2Transpose(){ + hipDeviceSynchronize(); + phase2_nosplit(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, stream); + hipDeviceSynchronize(); + } + + template + void TlrmvmhipConstRank::Phase2Prepare(){ + GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size()); + } + template + void TlrmvmhipConstRank::Phase2PrepareTranspose(){ + GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(), + tlrmvmcpu->h_phase2mappingTranspose.size()); + } + + template + void TlrmvmhipConstRank::Phase3(){ + hipDeviceSynchronize(); + for(int i=0; i + void TlrmvmhipConstRank::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i + void TlrmvmhipConstRank::Phase3GetMembuffer(){ + int batchsize = cudap3ptrs.Ms.size(); + GethipHostMemory(&cudap3ptrs.Abp, batchsize); + GethipHostMemory(&cudap3ptrs.xbp, batchsize); + GethipHostMemory(&cudap3ptrs.ybp, batchsize); + } + template + void TlrmvmhipConstRank::Phase3GetMembufferTranspose(){ + int batchsize = cudap3transptrs.Ms.size(); + GethipHostMemory(&cudap3transptrs.Abp, batchsize); + GethipHostMemory(&cudap3transptrs.xbp, batchsize); + GethipHostMemory(&cudap3transptrs.ybp, batchsize); + } + + template + void TlrmvmhipConstRank::AllocatePhase3Buffer(){ + GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt); + GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt); + GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt); + cudap3ptrs.Abp[0] = cudap3ptrs.A; + cudap3ptrs.xbp[0] = cudap3ptrs.x; + cudap3ptrs.ybp[0] = cudap3ptrs.y; + } + template + void TlrmvmhipConstRank::AllocatePhase3BufferTranspose(){ + cudap3transptrs.A = cudap1ptrs.A; + cudap3transptrs.x = cudap3ptrs.x; + GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt); + cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases + cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x + cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer + } + + template + void TlrmvmhipConstRank::Phase3CopyData(){ + auto AuMs = cudap3ptrs.Ms; + auto AuNs = cudap3ptrs.Ns; + auto AuKs = cudap3ptrs.Ks; + for(int i=1; iconfig.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK; + cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN; + cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN; + } + // load phase 3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt); + } + template + void TlrmvmhipConstRank::Phase3CopyDataTranspose(){ + for(int i=1; i + void 
TlrmvmhipConstRank::MVMTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, stream); + hipDeviceSynchronize(); + for(int i=0; i + void TlrmvmhipConstRank::MVMNoTranspose() + { + hipDeviceSynchronize(); + hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap1ptrs.Ms[0],cudap1ptrs.Ns[0],cudap1ptrs.Ks[0], + &alpha, (const DeviceType**)d_p1Aptrs, cudap1ptrs.Ms[0], + (const DeviceType**)d_p1xptrs, cudap1ptrs.Ks[0], + &beta,d_p1yptrs, cudap1ptrs.Ms[0], cudap1ptrs.Ms.size()); +// for(int i=0; i(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, stream); + hipDeviceSynchronize(); + hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap3ptrs.Ms[0],cudap3ptrs.Ns[0],cudap3ptrs.Ks[0], + &alpha, (const DeviceType**)d_p3Aptrs, cudap3ptrs.Ms[0], + (const DeviceType**)d_p3xptrs, cudap3ptrs.Ks[0], + &beta,d_p3yptrs, cudap3ptrs.Ms[0], cudap3ptrs.Ms.size()); +// for(int i=0; i + void TlrmvmhipConstRank::CopyBackResults() + { + // use cpu pointers to send output + if(transpose){ + CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + tlrmvmcpu->CopyToFinalresults(); + } + + template + void TlrmvmhipConstRank::MVM() { + if(transpose){ + MVMTranspose(); + }else{ + MVMNoTranspose(); + } + } + + template + void TlrmvmhipConstRank::SetTransposeConjugate(bool transpose, bool conjugate) { + this->transpose = transpose; + this->conjugate = conjugate; + tlrmvmcpu->SetTransposeConjugate(transpose, conjugate); + } + + + + template class TlrmvmhipConstRank; + template class TlrmvmhipConstRank; + template class TlrmvmhipConstRank, hipComplex>; + template class TlrmvmhipConstRank, hipDoubleComplex>; + +} // namespace cudatlrmvm + diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.hpp b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp new file mode 100644 index 0000000..ce2c85e --- /dev/null +++ b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp @@ -0,0 +1,103 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
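+
+// A minimal usage sketch (illustrative; names are placeholders). The call
+// sequence mirrors Tlrmvmhip, but StreamInit creates exactly one
+// stream/handle pair regardless of the requested size, and MVM dispatches
+// to the batched-GEMM path:
+//
+//   TlrmvmConfig cfg(originM, originN, nb, datafolder, acc, problemname);
+//   TlrmvmhipConstRank<complex<float>, hipComplex> mvm(cfg);
+//   mvm.StreamInit(1);
+//   mvm.MemoryInit();             // also uploads the per-tile A/x/y pointer arrays
+//   mvm.setX(xvec, cfg.originN);
+//   mvm.MVM();
+//   mvm.CopyBackResults();
+//   mvm.MemoryFree();
+//   mvm.StreamDestroy();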
+ +#pragma once + +#include "../../common/Common.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "../../common/hip/Util.hpp" +#include "Tlrmvmhip.hpp" +#include +#include +#include +#include +#include + +namespace hiptlrmvm +{ + + template + class TlrmvmhipConstRank + { + public: + explicit TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig); + TlrmvmhipConstRank(); + void UpdateConfig(TlrmvmConfig &tlrmvmConfig); + void SetTransposeConjugate(bool transpose, bool conjugate); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1GetMembuffer(); + void AllocatePhase1Buffer(); + void Phase1CopyData(); + void Phase1GetMembufferTranspose(); + void AllocatePhase1BufferTranspose(); + void Phase1CopyDataTranspose(); + void Phase2(); + void Phase2Transpose(); + void Phase2Prepare(); + void Phase2PrepareTranspose(); + void Phase3(); + void Phase3Transpose(); + void Phase3GetMembuffer(); + void AllocatePhase3Buffer(); + void Phase3CopyData(); + void Phase3GetMembufferTranspose(); + void AllocatePhase3BufferTranspose(); + void Phase3CopyDataTranspose(); + void MVM(); + void MVMTranspose(); + void MVMNoTranspose(); + void MVMGraph(); + void MVMGraphTranspose(); + void MVMGraphNoTranspose(); + void setX(HostType * xvector, size_t xlength); + // seperate 2 functions. + void TryConjugateXvec(); + void TryConjugateResults(); + void CopyBackResults(); + + bool transpose; + bool conjugate; + // cpu instance + TlrmvmConfig config; + shared_ptr> tlrmvmcpu; + // GPU resources + hipStream_t stream; + hipblasHandle_t cublashandle; + DeviceType alpha; + DeviceType beta; + + // gpu pointers + HIPPhasePointers cudap1ptrs; + HIPPhasePointers cudap1transptrs; + size_t *d_phase2mapping; + size_t *d_phase2mapping_transpose; + HIPPhasePointers cudap3ptrs; + HIPPhasePointers cudap3transptrs; + + DeviceType **d_p1Aptrs; + DeviceType **d_p1xptrs; + DeviceType **d_p1yptrs; + DeviceType **d_p3Aptrs; + DeviceType **d_p3xptrs; + DeviceType **d_p3yptrs; + + DeviceType **d_p1transAptrs; + DeviceType **d_p1transxptrs; + DeviceType **d_p1transyptrs; + DeviceType **d_p3transAptrs; + DeviceType **d_p3transxptrs; + DeviceType **d_p3transyptrs; + }; + + + + +} // + + + diff --git a/src/tlrmvm/hip/hipkernel.cpp b/src/tlrmvm/hip/hipkernel.cpp new file mode 100644 index 0000000..66f4e8a --- /dev/null +++ b/src/tlrmvm/hip/hipkernel.cpp @@ -0,0 +1,77 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +// +// Created by Yuxi Hong on 28/02/2022. 
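+// This file holds the two hand-written kernels: phase2_nosplit_kernel is a
+// pure permutation (one thread per element, yu[map[i]] = yv[i]) scattering
+// the phase-1 output into the phase-3 input layout, and ConjugateKernel
+// negates the imaginary part in place (a no-op for float/double).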
+// + +#include "../../common/hip/Util.hpp" +#include "hipkernel.cuh" + + +namespace hiptlrmvm { + + template + __global__ void phase2_nosplit_kernel(const T * __restrict__ yv, + const size_t * __restrict__ phase2mapping, T * __restrict__ yu, size_t len) + { + size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_x < len){ + yu[phase2mapping[thread_x]] = yv[thread_x]; + } + } + + template + void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream){ + int dimx = 512; + int griddimx = (len+dimx-1) / dimx; + phase2_nosplit_kernel<<>>(yv, phase2mapping, yu, len); + HIPCHECK(hipGetLastError()); + } + + template void phase2_nosplit(const float*, const size_t *, float *, size_t, hipStream_t); + template void phase2_nosplit(const double*, const size_t *, double *, size_t, hipStream_t); + template void phase2_nosplit(const hipDoubleComplex*, const size_t *, + hipDoubleComplex *, size_t, hipStream_t); + template void phase2_nosplit(const hipComplex*, const size_t *, hipComplex *, size_t, hipStream_t); +// template void phase2_nosplit(const cuHalfComplex*, const size_t *, cuHalfComplex *, size_t, hipStream_t); + + + __forceinline__ __device__ float conj(float Invec){ + return Invec; + } + __forceinline__ __device__ double conj(double Invec){ + return Invec; + } + __forceinline__ __device__ hipComplex conj(hipComplex Invec){ + return {Invec.x, -Invec.y}; + } + __forceinline__ __device__ hipDoubleComplex conj(hipDoubleComplex Invec){ + return {Invec.x, -Invec.y}; + } + + template + __global__ void ConjugateKernel(T *Invec, size_t length) + { + size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_x < length){ + Invec[thread_x] = conj(Invec[thread_x]); + } + } + + template + void ConjugateDriver(T *Invec, size_t length, hipStream_t stream){ + int dimx = 512; + int griddimx = (length+dimx-1) / dimx; + ConjugateKernel<<>>(Invec, length); + HIPCHECK(hipGetLastError()); + } + template void ConjugateDriver(float *Invec, size_t length, hipStream_t stream); + template void ConjugateDriver(double *Invec, size_t length, hipStream_t stream); + + template void ConjugateDriver(hipComplex *Invec, size_t length, hipStream_t stream); + template void ConjugateDriver(hipDoubleComplex *Invec, size_t length, hipStream_t stream); + + +} // namespace + diff --git a/src/tlrmvm/hip/hipkernel.cuh b/src/tlrmvm/hip/hipkernel.cuh new file mode 100644 index 0000000..d25112d --- /dev/null +++ b/src/tlrmvm/hip/hipkernel.cuh @@ -0,0 +1,25 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + + +namespace hiptlrmvm{ + + // normal phase 2 + template + void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream); + + // in-place conjugate convert + template + void ConjugateDriver(T *Invec, size_t length, hipStream_t stream); + + + +} // namespace diff --git a/src/tlrmvm/hip/tlrmvmhiputil.cpp b/src/tlrmvm/hip/tlrmvmhiputil.cpp new file mode 100644 index 0000000..4714654 --- /dev/null +++ b/src/tlrmvm/hip/tlrmvmhiputil.cpp @@ -0,0 +1,94 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + +// +// Created by Yuxi Hong on 08/04/2022. 
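+// The graph helpers below manage the HIP events used around stream capture:
+// syncallstreams gathers events from streams 1..n-1 into stream 0, records
+// an event there, and fans back out so every stream joins at a phase
+// boundary; syncstream0 only gathers into stream 0; syncotherstreams records
+// one event on stream 0 and makes all other streams wait on it.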
+// + +#include "tlrmvmhiputil.hpp" +#include +#include +#include + +namespace hiptlrmvm{ + + SingleGraph::SingleGraph() {} + + void SingleGraph::StreamInit(int streamsize) { + // single graph creation + this->streamsize = streamsize; + HIPCHECK(hipEventCreate(&event_start)); + HIPCHECK(hipEventCreate(&event_phase2finish)); + HIPCHECK(hipEventCreate(&event_phase1finish)); + events = new hipEvent_t[4*streamsize]; + for(int i=0; i<4*streamsize; i++) HIPCHECK(hipEventCreate(&events[i])); + graphCreated = false; + } + + void SingleGraph::syncallstreams(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) { + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(eventsptr[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], eventsptr[streami],0); + } + hipEventRecord(eventsptr[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], eventsptr[0],0); + } + } + + void SingleGraph::syncstream0(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) { + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(eventsptr[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], eventsptr[streami],0); + } + } + + void SingleGraph::syncotherstreams(hipEvent_t event, hipStream_t * streamptr, int stream_size){ + hipEventRecord(event, streamptr[0]); + for(int streami=1; streamibatchsize = batchsize; + this->streamsize = streamsize; + // multi graph creation + event_start.resize(batchsize); + event_phase2finish.resize(batchsize); + graphCreated.resize(batchsize); + instance.resize(batchsize); + graph.resize(batchsize); + events = new hipEvent_t*[batchsize]; + for(int bi=0; bi *hy, size_t xlength){ + double rmax = 0; + double imax = 0; + for(int i=0; i +using std::vector; + +#include "../cpu/TlrmvmCPU.hpp" + +#ifdef USE_MPI +#include +#endif + +namespace hiptlrmvm { + + struct SingleGraph{ + SingleGraph(); + void StreamInit(int streamsize); + int streamsize; + hipGraph_t graph; + bool graphCreated; + hipGraphExec_t instance; + hipEvent_t* events; + hipEvent_t event_start; + hipEvent_t event_phase1finish; + hipEvent_t event_phase2finish; + void syncallstreams(hipEvent_t * events, hipStream_t * stream,int streamsize); + void syncstream0(hipEvent_t * events, hipStream_t * stream,int streamsize); + void syncotherstreams(hipEvent_t event, hipStream_t * stream,int streamsize); + }; + + struct MultiGraph{ + MultiGraph(); + void StreamInit(int batchsize, int streamsize); + int batchsize; + int streamsize; + vector graph; + vector graphCreated; + vector instance; + hipEvent_t* *events; + vector event_start; + vector event_phase2finish; + }; + + // BatchTlrmvmcudaINT8 + + struct CUDAI8basesPointers{ + CUDAI8basesPointers(); + size_t Acnt; + size_t Xcnt; + size_t Ycnt; + vector Ms; + vector Ks; + vector Ns; + + hipInt8Complex * Abuffer; // real data buffer + vector maxA; + hipComplex * maxA_device; // used to scale up to fp16 + vector Aelems; // each gemv A elems + vector Aelemsoffset; // each gemv A elems, prefix + size_t * Aelemsoffset_device; // used to scale up to fp16 + hipHalfComplex * ybuffer; // y buffer, alway a half buffer + + vector xelems; // each gemv x elems + vector xelemsoffset; // each gemv x elems, prefix + + vector yelems; // each gemv y elems + vector yelemsoffset; // each gemv y elems, prefix + + }; + + struct CUDAI8XPointers{ + CUDAI8XPointers(); + 
hipInt8Complex * xbuffer; + vector maxx; + hipComplex * maxx_device; // used to scale up to fp16 + vector xelems; // each gemv x elems + size_t* xelems_device; // each gemv x elems + vector xelemsoffset; // each gemv x elems, prefix + size_t* xelemsoffset_device; // each gemv x elems, prefix + hipComplex *p3xreductionbuffer_device; + }; + + struct CBMaxInfo{ + CBMaxInfo(); + size_t maxA; + size_t maxx; + size_t maxy; + size_t maxbatchsize; + }; + + void getcomplexvectormax(complex *hy, size_t xlength); + +} + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 476d37e..5893bff 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,3 +44,20 @@ if(BUILD_CUDA) install(TARGETS ${cbins} DESTINATION test) endforeach() endif() + +if(BUILD_HIP) + + set(BINS + Test_hip_hipblas + Test_hip_tlrmvm + Test_hip_constrank +# Test_hip_tlrmvmgraph + ) + foreach(cbins ${BINS}) + WrapHIPBinary(${cbins} hip) + install(TARGETS ${cbins} DESTINATION test) + endforeach() +# install(FILES +# ${CMAKE_CURRENT_LIST_DIR}/hip/Test_hip_tlrmvm_correctness.sh +# DESTINATION test PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE) +endif() \ No newline at end of file diff --git a/test/cpp/Test_cpu_tlrmvm.cpp b/test/cpp/Test_cpu_tlrmvm.cpp index f02018b..04625e0 100644 --- a/test/cpp/Test_cpu_tlrmvm.cpp +++ b/test/cpp/Test_cpu_tlrmvm.cpp @@ -1,3 +1,6 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. + #include #include #include diff --git a/test/cpp/ex2mpitlrmvm_complexfloat.cpp b/test/cpp/ex2mpitlrmvm_complexfloat.cpp index edb150d..d44f5ed 100644 --- a/test/cpp/ex2mpitlrmvm_complexfloat.cpp +++ b/test/cpp/ex2mpitlrmvm_complexfloat.cpp @@ -1,17 +1,20 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
+ #include #include #include -#include #include #include - #include #include +#define real complex using namespace std; int main (int argc, char ** argv){ int originM; int originN; int nb; + int loopsize; string acc; string datafolder; string problemname; @@ -22,7 +25,6 @@ int main (int argc, char ** argv){ vector bandstat; double bytesprocessed; size_t granksum; - int loopsize; auto argparser = ArgsParser(argc, argv); originM = argparser.getint("M"); originN = argparser.getint("N"); @@ -44,18 +46,17 @@ int main (int argc, char ** argv){ maskmat.Fill(0); for(int i=0; i> tlrmvmptr(tlrmvmconfig); - auto finalbuffer = new complex[tlrmvmptr.config.paddingM]; -// tlrmvmptr.xmat.Fill(0.001); - memset(finalbuffer, 0, sizeof(complex) * tlrmvmptr.config.paddingM); + TlrmvmCPU tlrmvmptr(tlrmvmconfig); + auto finalbuffer = new real[tlrmvmptr.config.paddingM]; + memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM); tlrmvmptr.MemoryInit(); - auto curx = Matrix>(tlrmvmptr.config.originN, 1); - curx.Fill(complex(0.1,1.0)); + auto curx = Matrix(tlrmvmptr.config.originN, 1); + curx.Fill(real(0.1,1.0)); tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]); for(int i=0; i> yv_pc = seismicpcmat.Phase1(); - auto hyv = Matrix>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); + CFPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN); + pcmat.setX(tlrmvmptr.xmat); + pcmat.GetDense(); + Matrix yv_pc = pcmat.Phase1(); + auto hyv = Matrix(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); // cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl; - Matrix> yu_pc = seismicpcmat.Phase2(); - auto hyu = Matrix>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); + Matrix yu_pc = pcmat.Phase2(); + auto hyu = Matrix(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); // cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl; - Matrix> y_pc = seismicpcmat.Phase3(); - auto hy = Matrix>(finalbuffer, tlrmvmptr.config.originM, 1); - auto denseout = seismicpcmat.GetDense() * tlrmvmptr.xmat; + Matrix y_pc = pcmat.Phase3(); + auto hy = Matrix(finalbuffer, tlrmvmptr.config.originM, 1); + auto denseout = pcmat.GetDense() * tlrmvmptr.xmat; cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(denseout) << endl; #endif std::sort(mergetime.begin(), mergetime.end()); int N = mergetime.size(); cout << "median " << mergetime[N / 2] * 1e6 << " us."<< endl; - double bytes = TLRMVMBytesProcessed>(tlrmvmptr.config.granksum, + double bytes = TLRMVMBytesProcessed(tlrmvmptr.config.granksum, tlrmvmptr.config.nb, originM, originN); cout << "U and V bases size: " << bytes * 1e-6 << " MB." << endl; cout << "Bandwidth " << bytes / mergetime[N/2] * 1e-9 << " GB/s" << endl; } - delete[] finalbuffer; tlrmvmptr.MemoryFree(); + delete[] finalbuffer; MPI_Finalize(); return 0; } - - diff --git a/test/cpp/ex2mpitlrmvm_float.cpp b/test/cpp/ex2mpitlrmvm_float.cpp index 5685e0a..5fdb59f 100644 --- a/test/cpp/ex2mpitlrmvm_float.cpp +++ b/test/cpp/ex2mpitlrmvm_float.cpp @@ -1,11 +1,14 @@ +// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +// All rights reserved. 
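+
+// Note: this driver now uses a local `real` alias (float here,
+// complex<float> in the companion test) and, except under USE_NEC builds,
+// validates the MPI result against a dense reference,
+// pcmat.GetDense() * tlrmvmptr.xmat, rather than only the phase-3
+// proof-of-concept output.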
+ #include #include #include - #include #include #include #include +#define real float using namespace std; int main (int argc, char ** argv){ int originM; @@ -27,10 +30,10 @@ int main (int argc, char ** argv){ originN = argparser.getint("N"); nb = argparser.getint("nb"); loopsize = argparser.getint("loopsize"); - acc = argparser.getstring("errorthreshold"); - problemname = argparser.getstring("problemname"); + acc = argparser.getstring("threshold"); + problemname = argparser.getstring("problem"); datafolder = argparser.getstring("datafolder"); - char rpath[100]; + char rpath[300]; sprintf(rpath, "%s/%s_Rmat_nb%d_acc%s.bin", datafolder.c_str(), problemname.c_str(), nb, acc.c_str()); rankfile = string(rpath); TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, acc, problemname); @@ -47,21 +50,22 @@ int main (int argc, char ** argv){ maskmat.SetElem(i,j,1); } } - tlrmvmconfig.Maskmat = maskmat; - TlrmvmCPU tlrmvmptr(tlrmvmconfig); - double bytes = TLRMVMBytesProcessed(tlrmvmptr.config.granksum, - tlrmvmptr.config.nb, tlrmvmptr.config.paddingM, - tlrmvmptr.config.paddingN); + tlrmvmconfig.UpdateMaskmat(maskmat); + TlrmvmCPU tlrmvmptr(tlrmvmconfig); + auto finalbuffer = new real[tlrmvmptr.config.paddingM]; + memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM); tlrmvmptr.MemoryInit(); - auto finalbuffer = new float[tlrmvmptr.config.originM]; - for(int i=0; i(tlrmvmptr.config.originN, 1); + curx.Fill(0.1); + tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]); for(int i=0; i(end - start).count(); @@ -71,22 +75,25 @@ int main (int argc, char ** argv){ MPI_Allreduce(timestat.data(), mergetime.data(), timestat.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if(rank == 0){ - FPPCMatrix seismicpcmat(datafolder, acc, nb, problemname, originM, originN); - seismicpcmat.setX(tlrmvmptr.xmat); - seismicpcmat.GetDense(); - Matrix yv_pc = seismicpcmat.Phase1(); - auto hyv = Matrix(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); - // cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl; - Matrix yu_pc = seismicpcmat.Phase2(); - auto hyu = Matrix(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); - // cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl; - Matrix y_pc = seismicpcmat.Phase3(); - auto hy = Matrix(tlrmvmptr.p3ptrs.y, tlrmvmptr.config.originM, 1); - cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(y_pc) << endl; +#ifndef USE_NEC + FPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN); + pcmat.setX(tlrmvmptr.xmat); + pcmat.GetDense(); + Matrix yv_pc = pcmat.Phase1(); + auto hyv = Matrix(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); +// cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl; + Matrix yu_pc = pcmat.Phase2(); + auto hyu = Matrix(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); +// cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl; + Matrix y_pc = pcmat.Phase3(); + auto hy = Matrix(finalbuffer, tlrmvmptr.config.originM, 1); + auto denseout = pcmat.GetDense() * tlrmvmptr.xmat; + cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(denseout) << endl; +#endif std::sort(mergetime.begin(), mergetime.end()); int N = mergetime.size(); cout << "median " << mergetime[N / 2] * 1e6 << " us."<< endl; - double bytes = TLRMVMBytesProcessed(tlrmvmptr.config.granksum, + double bytes = TLRMVMBytesProcessed(tlrmvmptr.config.granksum, tlrmvmptr.config.nb, originM, originN); cout << "U and V bases size: " << bytes * 1e-6 << " MB." 
         cout << "Bandwidth " << bytes / mergetime[N/2] * 1e-9 << " GB/s" << endl;
@@ -96,5 +103,3 @@ int main (int argc, char ** argv){
     MPI_Finalize();
     return 0;
 }
-
-
diff --git a/test/cpp/ex3_gendata.cpp b/test/cpp/ex3_gendata.cpp
index 82ee60e..521b9b8 100644
--- a/test/cpp/ex3_gendata.cpp
+++ b/test/cpp/ex3_gendata.cpp
@@ -1,3 +1,6 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
 #include
 #include
 #include
diff --git a/test/cpp/mkl_example.h b/test/cpp/mkl_example.h
index 85d9696..64b2ec2 100644
--- a/test/cpp/mkl_example.h
+++ b/test/cpp/mkl_example.h
@@ -1,16 +1,5 @@
-/*******************************************************************************
-* Copyright 1999-2021 Intel Corporation.
-*
-* This software and the related documents are Intel copyrighted materials, and
-* your use of them is governed by the express license under which they were
-* provided to you (License). Unless the License provides otherwise, you may not
-* use, modify, copy, publish, distribute, disclose or transmit this software or
-* the related documents without Intel's prior written permission.
-*
-* This software and the related documents are provided as is, with no express
-* or implied warranties, other than those that are expressly stated in the
-* License.
-*******************************************************************************/
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
 
 /*
 !  Content:
diff --git a/test/cpp/runexp1.sh b/test/cpp/runexp1.sh
index 57b66b7..fd0a4d0 100644
--- a/test/cpp/runexp1.sh
+++ b/test/cpp/runexp1.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 ./install/test/ex1basic_float --M=4802 --N=19078 \
 --errorthreshold=0.0001 --problemname=mavis_000_R \
 --datafolder=$WORK_ROOT --nb=256
diff --git a/test/cpp/runexp2A64FX.sh b/test/cpp/runexp2A64FX.sh
index 0ea6218..1c3741f 100644
--- a/test/cpp/runexp2A64FX.sh
+++ b/test/cpp/runexp2A64FX.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # For NEC, for some reason the proof-of-concept matrix runs very slowly;
 # one can try to bypass that logic to get a speed-up.
 
diff --git a/test/cpp/runexp2AMD.sh b/test/cpp/runexp2AMD.sh
index b2da080..844ba47 100644
--- a/test/cpp/runexp2AMD.sh
+++ b/test/cpp/runexp2AMD.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # AMD CPU Experiments
 ################################################
 
diff --git a/test/cpp/runexp2AMD_synthetic.sh b/test/cpp/runexp2AMD_synthetic.sh
index afbf5f8..8437f51 100644
--- a/test/cpp/runexp2AMD_synthetic.sh
+++ b/test/cpp/runexp2AMD_synthetic.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # run on AMD Milan, 2 sockets, 1 cpu/socket
 
 OMP_NUM_THREADS=8 mpirun -np 16 --map-by L3cache:PE=8 \
diff --git a/test/cpp/runexp2ICX.sh b/test/cpp/runexp2ICX.sh
index 3c06750..3ea4e5a 100644
--- a/test/cpp/runexp2ICX.sh
+++ b/test/cpp/runexp2ICX.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 OMP_NUM_THREADS=28 mpirun -np 2 --map-by L3cache:PE=28 \
 ./install/test/ex2mpitlrmvm_complexfloat --M=9801 --N=9801 \
 --errorthreshold=0.001 --problemname=SeismicFreq100 \
diff --git a/test/cpp/runexp2NEC.sh b/test/cpp/runexp2NEC.sh
index 70e5b82..d051e4b 100644
--- a/test/cpp/runexp2NEC.sh
+++ b/test/cpp/runexp2NEC.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # For NEC, for some reason the proof-of-concept matrix runs very slowly;
 # one can try to bypass that logic to get a speed-up.
 
diff --git a/test/cpp/runexp3_gendata.sh b/test/cpp/runexp3_gendata.sh
index 0dd95c4..210e8a9 100644
--- a/test/cpp/runexp3_gendata.sh
+++ b/test/cpp/runexp3_gendata.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 ./install/test/ex3_gendata --M=4802 --N=19078 \
 --errorthreshold=0.0001 --problemname=Sytheticfloat \
 --datafolder=$WORK_ROOT --nb=256 --constrank=100 --dtype=float
diff --git a/test/cuda/Test_A100TimeDecomposition.cpp b/test/cuda/Test_A100TimeDecomposition.cpp
index 1b6fce0..a2d5a40 100644
--- a/test/cuda/Test_A100TimeDecomposition.cpp
+++ b/test/cuda/Test_A100TimeDecomposition.cpp
@@ -1,3 +1,6 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
 #include
 #include
 #include
diff --git a/test/cuda/Test_BatchCall_Tlrmvm.cpp b/test/cuda/Test_BatchCall_Tlrmvm.cpp
index 75427b0..f068fb2 100644
--- a/test/cuda/Test_BatchCall_Tlrmvm.cpp
+++ b/test/cuda/Test_BatchCall_Tlrmvm.cpp
@@ -1,3 +1,6 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
 #include
 #include
 #include
diff --git a/test/cuda/Test_MultiGPUBatchCall_Tlrmvm.cpp b/test/cuda/Test_MultiGPUBatchCall_Tlrmvm.cpp
index df5c350..3d2f8df 100644
--- a/test/cuda/Test_MultiGPUBatchCall_Tlrmvm.cpp
+++ b/test/cuda/Test_MultiGPUBatchCall_Tlrmvm.cpp
@@ -1,3 +1,6 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
 #include
 #include
 #include
diff --git a/test/cuda/astronomy_cudatest.sh b/test/cuda/astronomy_cudatest.sh
index 7c4199f..b783102 100644
--- a/test/cuda/astronomy_cudatest.sh
+++ b/test/cuda/astronomy_cudatest.sh
@@ -1,3 +1,8 @@
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 #/bin/bash
 ./install/test/ex4cudagraph_float --M=4802 --N=19078 \
 --errorthreshold=0.0001 --problemname=mavis_000_R \
diff --git a/test/cuda/seismology_cudatest.sh b/test/cuda/seismology_cudatest.sh
index e5c0fa0..dae19b8 100644
--- a/test/cuda/seismology_cudatest.sh
+++ b/test/cuda/seismology_cudatest.sh
@@ -1,5 +1,10 @@
 #!/bin/bash
 
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # 1. Dense MVM Benchmark
 ./install/test/ex5dense 10000 10000 1000 complex
 #Results,DenseMVM,complex,V100,median time 1000.42 us,Bandwidth 799.827 GB/s
diff --git a/test/cuda/seismology_mixedprec_cudatest.sh b/test/cuda/seismology_mixedprec_cudatest.sh
index 92248d3..8771e5f 100644
--- a/test/cuda/seismology_mixedprec_cudatest.sh
+++ b/test/cuda/seismology_mixedprec_cudatest.sh
@@ -1,5 +1,10 @@
 #!/bin/bash
 
+#
+# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+# All rights reserved.
+#
+
 # 2. TLR-MVM Benchmark [ ordering type : No ordering, NVIDIA]
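+# Flags parsed by the test binaries: --M/--N are the matrix dimensions,
+# --nb is the tile size, --errorthreshold is the compression accuracy, and
+# --problemname is the dataset prefix used to locate the input binaries
+# (e.g. <problemname>_Rmat_nb<nb>_acc<errorthreshold>.bin).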
 ./install/test/ex4cudagraph_csingle_mp --M=9801 --N=9801 \
 --errorthreshold=0.001 --problemname=Mode4_Mck_freqslice_100 \
diff --git a/test/hip/Test_hip_constrank.cpp b/test/hip/Test_hip_constrank.cpp
new file mode 100644
index 0000000..2683e6f
--- /dev/null
+++ b/test/hip/Test_hip_constrank.cpp
@@ -0,0 +1,61 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
+#include
+#include
+#include
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto ranksize = argparser.getint("ranksize");
+    auto loopsize = argparser.getint("loopsize");
+
+    // rank size should be smaller than nb.
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, ranksize);
+    /********************************
+     * cuda instance
+     ********************************/
+    TlrmvmhipConstRank<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig);
+    cudatlrmvmptr.StreamInit(0);
+    cudatlrmvmptr.MemoryInit();
+    cudatlrmvmptr.SetTransposeConjugate(false, false);
+    cudatlrmvmptr.TryConjugateXvec();
+
+    // time
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    vector<double> rawtime;
+    float milliseconds = 0;
+
+    for(int i=0; i<loopsize; i++){
+        hipEventRecord(start);
+        cudatlrmvmptr.MVM();
+        hipEventRecord(stop);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&milliseconds, start, stop);
+        rawtime.push_back(milliseconds * 1e-3);
+    }
+    std::sort(rawtime.begin(), rawtime.end());
+    int nruns = rawtime.size();
+    cout << "median time " << rawtime[nruns/2] * 1e6 << " us." << endl;
+    double bytes = TLRMVMBytesProcessed<complex<float>>(cudatlrmvmptr.config.granksum, nb,
+            originM, originN);
+    cout << "Bandwidth: " << bytes / rawtime[nruns/2] * 1e-9 << " GB/s." << endl;
+
+    cudatlrmvmptr.MemoryFree();
+    return 0;
+}
diff --git a/test/hip/Test_hip_hipblas.cpp b/test/hip/Test_hip_hipblas.cpp
new file mode 100644
index 0000000..4892fd5
--- /dev/null
+++ b/test/hip/Test_hip_hipblas.cpp
@@ -0,0 +1,23 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
+#include
+#include
+#include
+using namespace std;
+int main(){
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start);
+    for(int i=0; i<10; i++){
+        cout << "hello" << endl;
+    }
+    hipEventRecord(stop);
+    hipEventSynchronize(stop);
+    float milliseconds = 0;
+    hipEventElapsedTime(&milliseconds, start, stop);
+    cout << "time "<< milliseconds << endl;
+
+}
diff --git a/test/hip/Test_hip_tlrmvm.cpp b/test/hip/Test_hip_tlrmvm.cpp
new file mode 100644
index 0000000..d885a35
--- /dev/null
+++ b/test/hip/Test_hip_tlrmvm.cpp
@@ -0,0 +1,55 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
+#include
+#include
+#include
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto threshold = argparser.getstring("threshold");
+    auto problem = argparser.getstring("problem");
+    auto datafolder = argparser.getstring("datafolder");
+    auto streams = argparser.getint("streams");
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem);
+    /********************************
+     * cuda instance
+     ********************************/
+    Tlrmvmhip<complex<float>, hipComplex> cudaptr(tlrmvmconfig);
+    cudaptr.StreamInit(streams);
+    cudaptr.MemoryInit();
+
+    // do the computation and send results back to cpu instance.
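+    // Phase1 multiplies the V bases of each tile column with the input
+    // vector, Phase2 reshuffles the intermediate results from tile-column
+    // order to tile-row order, and Phase3 multiplies the U bases to
+    // accumulate the final output y.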
+    cudaptr.Phase1();
+    cudaptr.Phase2();
+    cudaptr.Phase3();
+    cudaptr.CopyBackResults();
+
+    CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN);
+    auto tlrmvmcpu = cudaptr.tlrmvmcpu;
+    seismicpcmat.setX(cudaptr.tlrmvmcpu->xmat);
+    auto densemat = seismicpcmat.GetDense();
+    Matrix<complex<float>> yv_pc = seismicpcmat.Phase1();
+    auto hyv = Matrix<complex<float>>(tlrmvmcpu->p1ptrs.y, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << "====================================================" << endl;
+    cout << "Test TLR-MVM CUDA Implementation. " << endl;
+    cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
+    Matrix<complex<float>> yu_pc = seismicpcmat.Phase2();
+    auto hyu = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.x, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
+    Matrix<complex<float>> y_pc = seismicpcmat.Phase3();
+    auto hy = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.y, tlrmvmcpu->config.paddingM, 1);
+    cout << " Phase 3 Correctness : "<< hy.allclose(y_pc) << endl;
+    auto denseout = densemat * cudaptr.tlrmvmcpu->xmat;
+    cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl;
+    cout << "====================================================" << endl;
+    cudaptr.MemoryFree();
+    return 0;
+}
diff --git a/test/hip/Test_hip_tlrmvm_singlecall.cpp b/test/hip/Test_hip_tlrmvm_singlecall.cpp
new file mode 100644
index 0000000..709e34d
--- /dev/null
+++ b/test/hip/Test_hip_tlrmvm_singlecall.cpp
@@ -0,0 +1,47 @@
+// @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST).
+// All rights reserved.
+
+#include
+#include
+#include
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto threshold = argparser.getstring("threshold");
+    auto problem = argparser.getstring("problem");
+    auto datafolder = argparser.getstring("datafolder");
+    auto streams = argparser.getint("streams");
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem);
+    /********************************
+     * cuda instance
+     ********************************/
+    Tlrmvmhip<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig);
+    cudatlrmvmptr.StreamInit(streams);
+    cudatlrmvmptr.MemoryInit();
+    cudatlrmvmptr.SetTransposeConjugate(false, false);
+    cudatlrmvmptr.TryConjugateXvec();
+
+    // do the computation and send results back to cpu instance.
+    cudatlrmvmptr.MVM();
+
+    cudatlrmvmptr.TryConjugateResults();
+    cudatlrmvmptr.CopyBackResults();
+
+    CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN);
+    auto densemat = seismicpcmat.GetDense();
+    auto hy = Matrix<complex<float>>(cudatlrmvmptr.tlrmvmcpu->finalresults, cudatlrmvmptr.tlrmvmcpu->config.originM, 1);
+    auto denseout = densemat * cudatlrmvmptr.tlrmvmcpu->xmat;
+    cout << "====================================================" << endl;
+    cout << "Test TLR-MVM conjugate single call Implementation. " << endl;
" << endl; + cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl; + cout << "====================================================" << endl; + cudatlrmvmptr.MemoryFree(); + return 0; +} diff --git a/test/python/generateinput.py b/test/python/generateinput.py index c64b8e2..2fef764 100644 --- a/test/python/generateinput.py +++ b/test/python/generateinput.py @@ -1,4 +1,7 @@ -import numpy as np +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + +import numpy as np import sys import os import argparse diff --git a/test/python/geninput_astronomy.sh b/test/python/geninput_astronomy.sh index 60880d6..8ce45b5 100644 --- a/test/python/geninput_astronomy.sh +++ b/test/python/geninput_astronomy.sh @@ -1,3 +1,8 @@ +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + # float python install/test/python/generateinput.py \ --nb=256 --error_threshold=0.0001 --problemname=mavis_000_R \ diff --git a/test/python/geninput_seismology.sh b/test/python/geninput_seismology.sh index 6bf6645..747ff43 100644 --- a/test/python/geninput_seismology.sh +++ b/test/python/geninput_seismology.sh @@ -1,4 +1,9 @@ -# complex single +# +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. +# + +# complex single python install/test/python/generateinput.py \ --nb=256 --error_threshold=0.001 --problemname=SeismicFreq100 \ --datatype=csingle --TLRMVM_ROOT=$(pwd)/install \ diff --git a/test/python/mode4generateinput.py b/test/python/mode4generateinput.py index e69de29..a0df358 100644 --- a/test/python/mode4generateinput.py +++ b/test/python/mode4generateinput.py @@ -0,0 +1,3 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + diff --git a/test/python/seismicexample.py b/test/python/seismicexample.py index 83dff3b..4f81a8f 100644 --- a/test/python/seismicexample.py +++ b/test/python/seismicexample.py @@ -1,3 +1,6 @@ +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + import numpy as np import sys import os diff --git a/test/python/tlrmvmexample.py b/test/python/tlrmvmexample.py index 09a07ad..73575e6 100644 --- a/test/python/tlrmvmexample.py +++ b/test/python/tlrmvmexample.py @@ -3,7 +3,10 @@ We provide only TLRMVMCPU interface here, but one can easily follow the same way to provide interface to others. """ -import numpy as np +# @Copyright (c) 2022 King Abdullah University of Science and Technology (KAUST). +# All rights reserved. + +import numpy as np import sys import os import time diff --git a/thirdparty/pybind11 b/thirdparty/pybind11 deleted file mode 160000 index 45f792e..0000000 --- a/thirdparty/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 45f792efdd92da094548e2095d6efdbfa7e536ee