From de3632d3a35c0a3bc942c403f073c30fa897386c Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 13:43:58 +0100 Subject: [PATCH 01/22] Insert build system changes. --- src/Makefile | 6 +++ src/chain/Makefile | 13 +++++- src/configure | 79 ++++++++++++++++++++++++++++++++-- src/cudamatrix/Makefile | 13 +++++- src/makefiles/default_rules.mk | 10 ++++- src/nnet3/Makefile | 7 ++- src/nnet3bin/Makefile | 6 +++ 7 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..bc4375e30f6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..c4411f4b997 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,18 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ed627eceedc..feb2fd276ad 100755 --- a/src/configure +++ b/src/configure @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,63 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ]; then + ROCMDIR=$base + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true">> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCM will not be used! + ROCM is only supported with 64-bit Linux builds." + exit 1; + fi + + #add cusolver flags for newer toolkits + if [ "$CUSOLVER" == "true" ]; then + echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -371,6 +431,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +663,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +713,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +794,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -1304,6 +1376,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." 
failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..31c7c5ef3e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,19 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endifn include ../makefiles/default_rules.mk diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 3ae5ed5e2dd..c27b7b0a108 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,12 +145,17 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_ENABLED), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) +ifeq ($(CUDA), true) NVCC_DEP_COMMAND = $(CUDATKDIR)/bin/nvcc -M $(CUDA_FLAGS) $(CUDA_INCLUDE) $(CUDA_SRCS) endif +ifeq ($(ROCM), true) +HIPCC_DEP_COMMAND = $(HIPCC) -M $(ROCM_FLAGS) $(ROCM_INCLUDE) $(CUDA_SRCS) +endif +endif endif .PHONY: depend @@ -162,6 +167,9 @@ endif ifneq ($(NVCC_DEP_COMMAND),) -$(NVCC_DEP_COMMAND) >> .depend.mk endif +ifneq ($(HIPCC_DEP_COMMAND),) + -$(HIPCC_DEP_COMMAND) >> .depend.mk +endif # removing automatic making of "depend" as it's quite slow. #.depend.mk: depend diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 0bf1bebe096..b6c75ac7118 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -3,9 +3,14 @@ all: include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) - +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-descriptor-test nnet-parse-test nnet-component-test \ diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2bd23273982 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -3,8 +3,14 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \ From 64c27545ce49357fe900de377eb266e9fe11f46d Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 10:03:38 -0500 Subject: [PATCH 02/22] Remove extra quote. 
--- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index feb2fd276ad..21e439eeb4b 100755 --- a/src/configure +++ b/src/configure @@ -275,7 +275,7 @@ function configure_rocm { echo "# ROCm configuration" >> kaldi.mk echo >> kaldi.mk echo IS_GPU_BUILD = true >> kaldi.mk - echo ROCM = true">> kaldi.mk + echo ROCM = true >> kaldi.mk echo "ROCMDIR = $ROCMDIR" >> kaldi.mk echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk From ee18146a6ce723de6c26a78890f6e83b484c0460 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 07:05:47 -0500 Subject: [PATCH 03/22] Add hipify header. --- src/configure | 3 +- src/cudamatrix/Makefile | 4 +- src/cudamatrix/cu-device.cc | 8 +- src/cudamatrix/cu-kernels.cu | 9 ++- src/hip/hipify.h | 22 +++++ src/hip/math_constants.h | 152 +++++++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 21 +++++ 7 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 src/hip/hipify.h create mode 100644 src/hip/math_constants.h create mode 100644 src/makefiles/hip_64bit.mk diff --git a/src/configure b/src/configure index 21e439eeb4b..fa0b77373a0 100755 --- a/src/configure +++ b/src/configure @@ -258,9 +258,10 @@ function check_for_slow_expf { function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then - for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do if [ -f $base/bin/hipcc ]; then ROCMDIR=$base + break fi done fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 31c7c5ef3e5..512028c6c13 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -39,7 +39,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endifn + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..5bcb0552924 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..c644cbc0784 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,10 +28,17 @@ #include #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "cudamatrix/cu-kernels-ansi.h" +#include +#include +#else #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions diff --git a/src/hip/hipify.h b/src/hip/hipify.h new file mode 100644 index 00000000000..41b7a02cb04 --- /dev/null +++ b/src/hip/hipify.h @@ -0,0 +1,22 @@ +#ifndef __HIPIFY_H__ +#define __HIPIFY_H__ + +inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +// +// HIP types +// +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDevice hipGetDevice +#define cudaStream_t hipStream_t +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) + +// 
+// HIPCUB +// +#define cub hipcub + + +#endif //__HIPIFY_H__ diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h new file mode 100644 index 00000000000..7fb8fce8e71 --- /dev/null +++ b/src/hip/math_constants.h @@ -0,0 +1,152 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__MATH_CONSTANTS_H__) +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ +#define CUDART_INF_F __int_as_float(0x7f800000) +#define CUDART_NAN_F __int_as_float(0x7fffffff) +#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) +#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) +#define CUDART_ZERO_F 0.0f +#define CUDART_ONE_F 1.0f +#define CUDART_SQRT_HALF_F 0.707106781f +#define CUDART_SQRT_HALF_HI_F 0.707106781f +#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f +#define CUDART_SQRT_TWO_F 1.414213562f +#define CUDART_THIRD_F 0.333333333f +#define CUDART_PIO4_F 0.785398163f +#define CUDART_PIO2_F 1.570796327f +#define CUDART_3PIO4_F 2.356194490f +#define CUDART_2_OVER_PI_F 0.636619772f +#define CUDART_SQRT_2_OVER_PI_F 0.797884561f +#define CUDART_PI_F 3.141592654f +#define CUDART_L2E_F 1.442695041f +#define CUDART_L2T_F 3.321928094f +#define CUDART_LG2_F 0.301029996f +#define CUDART_LGE_F 0.434294482f +#define CUDART_LN2_F 0.693147181f +#define CUDART_LNT_F 2.302585093f +#define CUDART_LNPI_F 1.144729886f +#define CUDART_TWO_TO_M126_F 1.175494351e-38f +#define CUDART_TWO_TO_126_F 8.507059173e37f +#define CUDART_NORM_HUGE_F 3.402823466e38f +#define CUDART_TWO_TO_23_F 8388608.0f +#define CUDART_TWO_TO_24_F 16777216.0f +#define CUDART_TWO_TO_31_F 2147483648.0f +#define CUDART_TWO_TO_32_F 4294967296.0f +#define CUDART_REMQUO_BITS_F 3 +#define CUDART_REMQUO_MASK_F (~((~0)< Date: Thu, 8 Sep 2022 18:07:47 -0500 Subject: [PATCH 04/22] Add more entries to hipificatiion header to deal with the BLAS routines. --- src/cudamatrix/cu-allocator.h | 7 ++ src/cudamatrix/cu-array-inl.h | 5 ++ src/cudamatrix/cu-common.h | 9 +++ src/cudamatrix/cu-device.h | 14 +++- src/cudamatrix/cu-matrix.cc | 6 ++ src/cudamatrix/cublas-wrappers.h | 17 ++-- src/hip/hipify.h | 129 +++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 2 +- 8 files changed, 181 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..a3baa2fb33d 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,17 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..36b829046ed 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,12 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..617f4363269 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,20 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +//TODO: tests with ROCTX #include +#include +#else #include #include #include #include #include +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..515fa4d7d25 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,26 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include +#include +#else #include #include #include #include #include - 
+#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..a522f13451a 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..dc5c0e0ced5 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,17 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, + C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,8 +66,8 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, + return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, + (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); @@ -219,6 +222,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +247,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +324,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +341,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 
&& !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 41b7a02cb04..697afc7a6d3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,14 +5,143 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types +// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // + #define cudaDevAttrWarpSize hipDeviceAttributeWarpSize #define cudaDeviceGetAttribute hipDeviceGetAttribute #define cudaGetDevice hipGetDevice +#define cudaGetErrorString hipGetErrorString #define cudaStream_t hipStream_t #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) +#define cublasStatus_t hipblasStatus_t +#define cudaError_t hipError_t +#define cusparseDestroy hipsparseDestroy +#define cudaGetLastError hipGetLastError +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cublasCreate hipblasCreate +#define cublasSetStream hipblasSetStream +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define curandCreateGenerator hiprandCreateGenerator +#define curandSetStream hiprandSetStream +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetDeviceProperties hipGetDeviceProperties +#define curandDestroyGenerator hiprandDestroyGenerator +#define cusparseDestroy hipsparseDestroy +#define cudaDeviceProp hipDeviceProp_t +#define cublasOperation_t hipblasOperation_t +#define cublasStatus_t hipblasStatus_t +#define cusparseStatus_t hipsparseStatus_t +#define curandStatus_t hiprandStatus_t +#define cublasHandle_t hipblasHandle_t +#define cusparseHandle_t hipsparseHandle_t +#define curandGenerator_t hiprandGenerator_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cublasComputeType_t hipblasDatatype_t +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasDscal_v2 hipblasDscal +#define cublasSscal_v2 hipblasSscal +#define cudaSetDevice hipSetDevice +#define cudaSuccess hipSuccess +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnSetStream hipsolverDnSetStream +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define cusparseCreate hipsparseCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusparseSetStream hipsparseSetStream +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
+#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceReset hipDeviceReset +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cublasDestroy hipblasDestroy +#define cuDeviceGetName hipDeviceGetName +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cudaMemsetAsync hipMemsetAsync +#define cublasGemmEx hipblasGemmEx +#define cublasDgemm_v2 hipblasDgemm +#define cublasSger_v2 hipblasSger +#define cublasDger_v2 hipblasDger +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasSdot_v2 hipblasSdot +#define cublasSasum_v2 hipblasSasum +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasScopy_v2 hipblasScopy +#define cublasDcopy_v2 hipblasDcopy +#define cublasSgemv_v2 hipblasSgemv +#define cublasDgemv_v2 hipblasDgemv +#define cublasSspmv_v2 hipblasSspmv +#define cublasDspmv_v2 hipblasDspmv +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasSspr_v2 hipblasSspr +#define cublasDspr_v2 hipblasDspr +#define cudaDataType hipDataType +#define cusparseAction_t hipsparseAction_t +#define cublasDdot_v2 hipblasDdot +#define cublasDasum_v2 hipblasDasum +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasStpmv_v2 hipblasStpmv +#define cusparseIndexBase_t hipsparseIndexBase_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseOperation_t hipsparseOperation_t +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define cusparseCreateDnMat hipsparseCreateDnMat +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMM hipsparseSpMM +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseScsr2csc hipsparseScsr2csc +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_R_32F HIPBLAS_R_32F +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_OP_N HIPBLAS_OP_N +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define 
cudaMemset2DAsync hipMemset2DAsync // // HIPCUB // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 453d9d5fe62..b405d84a15b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,7 +5,7 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 \ +CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) From 07f2f36e398aa09a59a6655c212f8c1233f81216 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 18:36:28 -0500 Subject: [PATCH 05/22] Cudmatrix hipification complete. --- src/cudamatrix/cu-allocator.cc | 7 +++++ src/cudamatrix/cu-array.cc | 5 +++ src/cudamatrix/cu-block-matrix.cc | 6 ++++ src/cudamatrix/cu-common.cc | 5 +++ src/cudamatrix/cu-compressed-matrix.cc | 6 ++++ src/cudamatrix/cu-packed-matrix.cc | 6 ++++ src/cudamatrix/cu-sp-matrix.cc | 6 ++++ src/cudamatrix/cu-sparse-matrix.cc | 6 ++++ src/cudamatrix/cu-tp-matrix.cc | 6 ++++ src/cudamatrix/cu-vector.cc | 6 ++++ src/hip/hipify.h | 42 ++++++++++++++++++++++++++ 11 files changed, 101 insertions(+) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..8e08d3ef2a1 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #include +#endif + #include #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2017ebce5c7 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..a2bd910eba0 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..585d980ed19 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,12 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..0a5537b4248 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..f0563a6123f 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include 
+#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..a328457ca11 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..c0ebddfc95e 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..6929911fb5e 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..fa5d94fb0bc 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 697afc7a6d3..10010ceb70f 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -139,9 +139,51 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseDestroyMatDescr hipsparseDestroyMatDescr #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_C HIPBLAS_OP_C #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemset2DAsync hipMemset2DAsync +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define CUSPARSE_STATUS_NOT_SUPPORTED 
HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cuMemGetInfo_v2 hipMemGetInfo + // // HIPCUB // From fde6f7f478ce18af0142885fd625a33ce2946671 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 06:54:00 -0500 Subject: [PATCH 06/22] Ignore Eclipse synchronized project files. --- .gitignore | 4 ++++ src/chain/Makefile | 2 +- src/chain/chain-kernels-ansi.h | 4 ++++ src/chain/chain-kernels.cu | 5 +++++ src/makefiles/hip_64bit.mk | 8 +++----- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..53a4079d9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,7 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* + +# Eclipse sync project +.ptp-sync +.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index c4411f4b997..678bb03ef33 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -38,7 +38,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..739b9005854 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). 
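The guard added above is the heart of the scheme that lets the
unmodified .cu kernel sources build for AMD GPUs: the Makefile rule
compiles them with "hipcc -c -x hip", and __IS_HIP_COMPILE__ pulls in
the HIP runtime headers plus hipify.h, whose defines rename the CUDA
host API to its HIP counterparts during preprocessing. A minimal sketch
of the pattern (scale is an invented kernel, not one from the tree):

    // scale.cu -- one source for both nvcc and "hipcc -c -x hip"
    #ifdef __IS_HIP_COMPILE__
    #include <hip/hip_runtime.h>  // HIP runtime; hipify.h remaps the host API
    #endif

    __global__ void scale(float *x, float a, int n) {
      // identical index math in the CUDA and HIP kernel dialects
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= a;
    }
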
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index b405d84a15b..6ca4ea7d1b6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -13,9 +13,7 @@ ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64/stubs -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib/stubs -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -ROCM_LDFLAGS += - +#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib #CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt -ROCM_LDLIBS += +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 21ca60dfeeee2496801869ee96667cfd73df4aa6 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 08:02:20 -0500 Subject: [PATCH 07/22] Hipify complete including NVTX. --- src/chain/chain-kernels.cu | 1 - src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 84 ++++++++++++++------------ src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 7 ++- 16 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 739b9005854..2a30128750c 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -22,7 +22,6 @@ #ifdef __IS_HIP_COMPILE__ #include -#include #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 8e08d3ef2a1..82d682588d8 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index a3baa2fb33d..0cc1f7e6a4b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index a2bd910eba0..04885296445 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 585d980ed19..6275bc9073a 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -25,8 +25,10 @@ #ifdef __IS_HIP_COMPILE__ #include #include +#define API_NAME_PREFIX "HIP" #else #include +#define API_NAME_PREFIX "CU" #endif #include "base/kaldi-common.h" @@ -36,6 +38,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 
0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -48,9 +53,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else nvtxRangePop(); +#endif } #endif @@ -92,16 +102,16 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; } return "CUBLAS_STATUS_UNKNOWN_ERROR"; } @@ -110,43 +120,43 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case 
CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; + case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: return 
API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 617f4363269..a0c879414d4 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -36,7 +36,7 @@ #include #include #include -//TODO: tests with ROCTX #include +#include #include #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 0a5537b4248..de4fe6f8da2 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5bcb0552924..41f8d6f83d5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 515fa4d7d25..9286b6fe14a 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index a522f13451a..675ed74aeb4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index f0563a6123f..5acfc7443c4 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a328457ca11..adfb3e0b517 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index c0ebddfc95e..45742571a41 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 6929911fb5e..51fb744a855 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index fa5d94fb0bc..62ff16cb7f9 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 6ca4ea7d1b6..0ff628d67f6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,11 +5,14 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) 
endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ + +ROCM_USEROCTX = -DUSE_NVTX + +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) -ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ +ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread From 104023482690fbdc92d1cb190a85de8b697f86be Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:21:01 -0500 Subject: [PATCH 08/22] Format files for the hipification. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-array-inl.h | 2 +- src/cudamatrix/cu-array.cc | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 13 +- src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-kernels.cu | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 347 ++++++++++++------------- src/makefiles/hip_64bit.mk | 5 +- 19 files changed, 198 insertions(+), 199 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 82d682588d8..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,7 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0cc1f7e6a4b..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 36b829046ed..1fd80502cf9 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,7 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 2017ebce5c7..333e8fbed1c 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 04885296445..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 6275bc9073a..2e77062f20d 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #define API_NAME_PREFIX "HIP" #else 
#include @@ -112,8 +112,12 @@ const char* cublasGetStatusStringK(cublasStatus_t status) { case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { @@ -135,7 +139,7 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "SPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { @@ -155,6 +159,9 @@ const char* curandGetStatusString(curandStatus_t status) { case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index a0c879414d4..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index de4fe6f8da2..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 41f8d6f83d5..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 9286b6fe14a..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index c644cbc0784..9a99f19b58f 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,7 +30,7 @@ #include #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include #else #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 675ed74aeb4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,7 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 5acfc7443c4..c9d686d0ce8 100644 ---
a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index adfb3e0b517..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 45742571a41..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 51fb744a855..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 62ff16cb7f9..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 10010ceb70f..89daad6bc28 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,187 +5,180 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types -// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // - -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaGetDevice hipGetDevice -#define cudaGetErrorString hipGetErrorString -#define cudaStream_t hipStream_t -#define cudaStreamLegacy ((hipStream_t)1) -#define cudaStreamPerThread ((hipStream_t)2) -#define cublasStatus_t hipblasStatus_t -#define cudaError_t hipError_t -#define cusparseDestroy hipsparseDestroy -#define cudaGetLastError hipGetLastError - -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cublasCreate hipblasCreate -#define cublasSetStream hipblasSetStream -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define curandCreateGenerator hiprandCreateGenerator -#define curandSetStream hiprandSetStream -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaGetDeviceProperties hipGetDeviceProperties -#define curandDestroyGenerator hiprandDestroyGenerator -#define cusparseDestroy hipsparseDestroy -#define cudaDeviceProp hipDeviceProp_t -#define cublasOperation_t hipblasOperation_t -#define cublasStatus_t hipblasStatus_t -#define cusparseStatus_t hipsparseStatus_t -#define curandStatus_t hiprandStatus_t -#define cublasHandle_t hipblasHandle_t -#define cusparseHandle_t hipsparseHandle_t -#define curandGenerator_t hiprandGenerator_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cublasComputeType_t hipblasDatatype_t -#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStreamSynchronize hipStreamSynchronize -#define 
cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasDscal_v2 hipblasDscal -#define cublasSscal_v2 hipblasSscal -#define cudaSetDevice hipSetDevice -#define cudaSuccess hipSuccess -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnSetStream hipsolverDnSetStream -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define cusparseCreate hipsparseCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusparseSetStream hipsparseSetStream -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaDeviceReset hipDeviceReset -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cublasDestroy hipblasDestroy -#define cuDeviceGetName hipDeviceGetName -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cudaMemsetAsync hipMemsetAsync -#define cublasGemmEx hipblasGemmEx -#define cublasDgemm_v2 hipblasDgemm -#define cublasSger_v2 hipblasSger -#define cublasDger_v2 hipblasDger -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasSdot_v2 hipblasSdot -#define cublasSasum_v2 hipblasSasum -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasScopy_v2 hipblasScopy -#define cublasDcopy_v2 hipblasDcopy -#define cublasSgemv_v2 hipblasSgemv -#define cublasDgemv_v2 hipblasDgemv -#define cublasSspmv_v2 hipblasSspmv -#define cublasDspmv_v2 hipblasDspmv -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasSspr_v2 hipblasSspr -#define cublasDspr_v2 hipblasDspr -#define cudaDataType hipDataType -#define cusparseAction_t hipsparseAction_t -#define cublasDdot_v2 hipblasDdot -#define cublasDasum_v2 hipblasDasum -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasStpmv_v2 hipblasStpmv -#define cusparseIndexBase_t hipsparseIndexBase_t -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define cusparseOperation_t hipsparseOperation_t -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define 
CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define cusparseCreateDnMat hipsparseCreateDnMat -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMM hipsparseSpMM -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseScsr2csc hipsparseScsr2csc -#define CUDA_R_64F HIP_R_64F -#define CUDA_R_32F HIP_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_R_32F HIPBLAS_R_32F -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_C HIPBLAS_OP_C -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemset2DAsync hipMemset2DAsync -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define 
CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR #define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO -#define cudaMalloc hipMalloc -#define cudaMallocPitch hipMallocPitch -#define cuMemGetInfo_v2 hipMemGetInfo +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName 
+#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaSetDevice hipSetDevice +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble 
+#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetStream hiprandSetStream +#define curandStatus_t hiprandStatus_t +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cusolverDnSetStream hipsolverDnSetStream +#define cusparseAction_t hipsparseAction_t +#define cusparseCreate hipsparseCreate +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateDnMat hipsparseCreateDnMat +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define cusparseHandle_t hipsparseHandle_t +#define cusparseIndexBase_t hipsparseIndexBase_t +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cusparseOperation_t hipsparseOperation_t +#define cusparseScsr2csc hipsparseScsr2csc +#define cusparseSetStream hipsparseSetStream +#define cusparseSpMM hipsparseSpMM +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseStatus_t hipsparseStatus_t // -// HIPCUB +// HIPCUB namespace. // #define cub hipcub diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0ff628d67f6..0c558a770d6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,8 +5,8 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif - -ROCM_USEROCTX = -DUSE_NVTX +# Uncomment if willing to use ROCTX capabilities. +# ROCM_USEROCTX = -DUSE_NVTX CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) @@ -18,5 +18,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -#CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 801115d710904ca505e318e9cd9cc3ffa7fc0f87 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:57:45 -0500 Subject: [PATCH 09/22] Add hipification entries dropped by mistake. --- src/hip/hipify.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 89daad6bc28..7a0300ae02b 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -7,9 +7,12 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. 
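Note: the two TODO mappings just above flatten CUDA's TF32/FP16 "fast" compute types onto plain FP32 hipBLAS, and the tensor-op GEMM algorithm onto the default one. A compile-time sketch of what that substitution means (hypothetical check file, not part of the patch; needs hipcc and -I../hip since hipify.h contains device code, and the hipblas.h header path may vary across ROCm releases):

    // fallback_check.cc -- sketch; compile with hipcc -I../hip.
    #include <hipblas.h>
    #include "hipify.h"

    // After macro substitution both sides name the same hipBLAS enumerator,
    // i.e. a tensor-core request quietly degrades to the plain FP32 path.
    static_assert(CUBLAS_COMPUTE_32F_FAST_TF32 == HIPBLAS_R_32F,
                  "TF32 compute falls back to FP32 under ROCm");
    static_assert(CUBLAS_GEMM_DEFAULT_TENSOR_OP == HIPBLAS_GEMM_DEFAULT,
                  "tensor-op algo falls back to the default GEMM under ROCm");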
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. #define CUBLAS_OP_C HIPBLAS_OP_C #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_N HIPBLAS_OP_N @@ -146,6 +149,7 @@ #define curandGenerateUniformDouble hiprandGenerateUniformDouble #define curandGenerator_t hiprandGenerator_t #define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordering. #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t From 081de1ebcc44b846c4953bb3923818d6142b90cc Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 06:06:19 -0500 Subject: [PATCH 10/22] Change IS_GPU_ENABLED to IS_GPU_BUILD in depends build. --- src/makefiles/default_rules.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index c27b7b0a108..21a3b053639 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,7 +145,7 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(IS_GPU_ENABLED), true) +ifeq ($(IS_GPU_BUILD), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) From 00098bf097ca7e9e804562c937b20c6714adf2f8 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:11:35 -0500 Subject: [PATCH 11/22] Add build logic for ROCm < 5.2.0. --- src/configure | 28 +++++++++++++++++++++------- src/hip/hipify.h | 21 +++++++++++++++++++++ src/makefiles/hip_64bit.mk | 17 ++++++++++++----- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/configure b/src/configure index fa0b77373a0..ffb87abe106 100755 --- a/src/configure +++ b/src/configure @@ -259,7 +259,7 @@ function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do - if [ -f $base/bin/hipcc ]; then + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then ROCMDIR=$base break fi @@ -268,7 +268,7 @@ function configure_rocm { if [ -d "$ROCMDIR" ]; then if [ ! -f $ROCMDIR/bin/hipcc ]; then - failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + failure "Cannot find hipcc and hipconfig in ROCm directory $ROCMDIR" fi fi echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" @@ -289,7 +289,20 @@ function configure_rocm { echo "HOST_ARCH = `uname -m`" >> kaldi.mk echo >> kaldi.mk - + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards uses + # __HIP_PLATFORM_AMD__; older versions use __HIP_PLATFORM_HCC__. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk + else + echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + fi + # 64bit/32bit? Not Linux?
We do not support cross compilation with ROCm so, # use direct calls to uname -m here if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then @@ -300,10 +313,11 @@ WARNING: ROCM will not be used! ROCM is only supported with 64-bit Linux builds." exit 1; fi - - #add cusolver flags for newer toolkits - if [ "$CUSOLVER" == "true" ]; then - echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + else + echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk fi } diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 7a0300ae02b..bdefa9cc4dd 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -3,6 +3,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +#undef hipLaunchKernelGGLInternal +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#else +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#endif + // // HIP types // @@ -153,10 +167,17 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t +#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#else +#define cusolverDnCreate hipsolverCreate +#define cusolverDnDestroy hipsolverDestroy +#define cusolverDnHandle_t hipsolverHandle_t +#define cusolverDnSetStream hipsolverSetStream +#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0c558a770d6..3976624032d 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -8,13 +8,20 @@ endif # Uncomment if willing to use ROCTX capabilities. # ROCM_USEROCTX = -DUSE_NVTX -CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) +# Specific HIP/ROCm components should be included prior to the generic include to avoid +# deprecation warnings. +CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -DCUDA_VERSION=11000 \ + -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. 
-I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ - -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib From 9b8dffb3a594293fbf4286233df610ae6041b284 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:33:16 -0500 Subject: [PATCH 12/22] Complete ROCm 5.0.2 build with no per-thread streams yet. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +++--- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..3b47ee525eb 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..09ba2c9aa13 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..309d68fccf7 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..99165cc592f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..dfcbf41d131 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..c073ab358ea 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h 
b/src/cudamatrix/cu-device.h index d7edf5a5a1c..1311668ec33 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..96c1ef14ed4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..8a5865f71af 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..fabd06c9b16 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..3853ffa7e45 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..dd3a333c9a5 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..cc6332ba48c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 3976624032d..160f5fb5c0f 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -10,7 +10,7 @@ endif # Specific HIP/ROCm components should be included prior to the generic include to avoid # deprecation warnings. -CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ From e84d8f072496c9427e804f8189854da9ff49c04b Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 07:44:43 -0500 Subject: [PATCH 13/22] Add cudadecoder support for ROCm 5.2.x. 
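Note on the stream model in this patch: on ROCm >= 5.2 the build relies on -fgpu-default-stream=per-thread, while for older ROCm the Makefiles below gain a sed-based %.hip rule that rewrites every triple-chevron launch to name hipStreamPerThread explicitly. The intended end state for a launch, as a sketch (hypothetical kernel, not taken from the patch):

    #include <hip/hip_runtime.h>

    __global__ void scale_kernel(float *x, float a, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= a;
    }

    void Scale(float *x, float a, int n) {
      int blocks = (n + 255) / 256;
      // What the sed rewrite produces from `scale_kernel<<<blocks, 256>>>(...)`:
      // a zero dynamic-shared-memory size plus the explicit per-thread stream.
      scale_kernel<<<blocks, 256, 0, hipStreamPerThread>>>(x, a, n);
    }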
--- src/chain/Makefile | 16 ++++-- src/configure | 9 +++- src/cudadecoder/Makefile | 22 +++++++- .../batched-static-nnet3-kernels.cu | 5 ++ .../batched-static-nnet3-kernels.h | 5 ++ ...hed-threaded-nnet3-cuda-online-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline2.cc | 5 ++ src/cudadecoder/cuda-decoder-kernels-utils.h | 4 +- src/cudadecoder/cuda-decoder-kernels.cu | 6 +++ src/cudadecoder/cuda-decoder.cc | 24 +++++---- src/cudadecoder/cuda-decoder.h | 5 ++ src/cudadecoder/cuda-fst.cc | 6 +++ src/cudadecoderbin/Makefile | 4 +- .../batched-wav-nnet3-cuda-online.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 7 +++ src/cudafeat/Makefile | 23 +++++++- ...eature-online-batched-cmvn-cuda-kernels.cu | 5 ++ ...ure-online-batched-ivector-cuda-kernels.cu | 5 ++ .../feature-online-batched-ivector-cuda.cc | 16 ++++++ ...re-online-batched-spectral-cuda-kernels.cu | 6 +++ .../feature-online-batched-spectral-cuda.h | 5 ++ src/cudafeat/feature-online-cmvn-cuda.cu | 8 +++ src/cudafeat/feature-spectral-cuda.cu | 6 +++ src/cudafeat/feature-spectral-cuda.h | 5 ++ src/cudafeat/feature-window-cuda.cu | 5 ++ .../online-batched-feature-pipeline-cuda.cc | 7 ++- .../online-batched-feature-pipeline-cuda.h | 4 ++ .../online-ivector-feature-cuda-kernels.cu | 6 +++ src/cudafeat/online-ivector-feature-cuda.cc | 14 ++++- src/cudamatrix/Makefile | 16 ++++-- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +-- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 +-- src/cudamatrix/cu-kernels.cu | 1 + src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 54 ++++++++++++++----- src/makefiles/hip_64bit.mk | 18 +++++-- 48 files changed, 318 insertions(+), 62 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 678bb03ef33..5cc8d8901a1 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,13 +33,21 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ffb87abe106..ca3df9563ab 100755 --- a/src/configure +++ b/src/configure @@ -316,8 +316,9 @@ WARNING: ROCM will not be used! 
if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk else - echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk fi } @@ -1055,7 +1056,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..062e9a47d41 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..0bcb1997576 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,12 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..c7012b686e0 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,12 @@ #include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc 
index 89e93e5d98c..d5cf7dae2d7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..f6a3455db01 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..6a14371911d 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
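Note: the two hunks in cuda-decoder-kernels-utils.h above replace whole-union assignment (assumed = old) with an explicit copy of the 64-bit member, the form hipcc compiles cleanly inside these atomicCAS retry loops. For reference, the fixed min-update pattern in isolation (a self-contained restatement of the code above, not new functionality):

    #include <hip/hip_runtime.h>

    union UInt64UnionInt2 {
      int2 i2;
      unsigned long long ull;
    };

    __device__ inline void atomicMinI2(int2 *ptr, int2 val) {
      unsigned long long *ptr64 = reinterpret_cast<unsigned long long *>(ptr);
      UInt64UnionInt2 old, assumed, value;
      old.ull = *ptr64;
      value.i2 = val;
      if (old.i2.x <= val.x) return;
      do {
        assumed.ull = old.ull;  // copy through the 64-bit view, not `assumed = old`
        old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
      } while (old.ull != assumed.ull && old.i2.x > value.i2.x);
    }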
+#ifdef __IS_HIP_COMPILE__ +#include "float.h" +#include +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..06dceae73a5 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,14 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +190,35 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void**)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void**)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void**)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void**)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..510904aa004 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,12 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..3af37eb7676 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,14 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- 
a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..56368853df2 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. #endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..05af50d7a3b 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..c14571f2ed9 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,16 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include "hipify.h" +#else #include #include #include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..c3a4489e18e 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..09b0caff255 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 
+15,12 @@ // See the License for the specific language governing permissions and // limitations under the License. // +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..0b4cfce812c 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,12 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..6d68c93f917 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..f847311d755 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..8d4648d04bb 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
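Note: the feature-online-batched-ivector-cuda.cc prologue above re-points CUBLAS_FILL_MODE_LOWER (and CUDA_R_32F) at hipSOLVER/hipBLAS enumerators because the batched solver entry points it maps (cusolverDnSpotrfBatched/cusolverDnSpotrsBatched onto hipsolverDn*) take different enum types than the hipify.h defaults. A hedged sketch of the resulting factorization call (hypothetical wrapper and buffers; the signature is mirrored from the cuSOLVER batched API, and the header path varies across ROCm releases):

    #include <hipsolver/hipsolver.h>

    // Batched Cholesky (LL^T) of n-by-n SPD matrices, lower triangle stored.
    hipsolverStatus_t CholeskyFactorBatched(hipsolverDnHandle_t handle, int n,
                                            float *Aarray[], int lda,
                                            int *infoArray, int batchSize) {
      return hipsolverDnSpotrfBatched(handle, HIPSOLVER_FILL_MODE_LOWER, n,
                                      Aarray, lda, infoArray, batchSize);
    }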
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +39,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..c320c85a029 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-spectral-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..6ba45e682c1 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,12 @@ #include "cudafeat/feature-window-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..650b51ec3c7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,12 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { @@ -95,7 +100,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..378ea18e689 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..c3b15d72a5b 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,19 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else #include #endif +#endif + #include #include "base/io-funcs.h" @@ -288,13 +299,14 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. CuMatrix A(quadratic); + + #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); - // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 512028c6c13..5cd4adcffd8 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,12 +34,20 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 3b47ee525eb..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 09ba2c9aa13..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 309d68fccf7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 99165cc592f..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc 
b/src/cudamatrix/cu-compressed-matrix.cc index dfcbf41d131..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c073ab358ea..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 1311668ec33..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9a99f19b58f..1d6e0664541 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -23,6 +23,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. + // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 96c1ef14ed4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 8a5865f71af..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index fabd06c9b16..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 3853ffa7e45..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index dd3a333c9a5..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cc6332ba48c..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index bdefa9cc4dd..24b5f2f8eb3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -1,29 +1,22 @@ #ifndef __HIPIFY_H__ #define __HIPIFY_H__ 
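+// hipify.h maps the CUDA API names used throughout the .cu sources onto
+// their HIP counterparts with plain #defines, so those files can be
+// compiled unchanged by HIPCC.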
+#ifdef __HIPCC__ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} - - -#undef hipLaunchKernelGGLInternal -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +// AMDGCN only supports this rounding mode. +#define __fdiv_rd __fdiv_rn #else -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +#define __align__(x) __attribute__((aligned (x))) #endif // // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. @@ -46,6 +39,8 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUDA_R_32F HIP_R_32F #define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS #define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT #define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED #define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH @@ -104,6 +99,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cublasGemmAlgo_t hipblasGemmAlgo_t #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasOperation_t hipblasOperation_t #define cublasSasum_v2 hipblasSasum @@ -133,15 +129,29 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse #define cudaErrorInvalidDevice hipErrorInvalidDevice #define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName #define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define 
cudaHostUnregister hipHostUnregister #define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice @@ -150,11 +160,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync #define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream #define curandCreateGenerator hiprandCreateGenerator #define curandDestroyGenerator hiprandDestroyGenerator #define curandGenerateNormal hiprandGenerateNormal @@ -178,6 +197,11 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream #endif +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -201,7 +225,9 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize #define cusparseSpMatDescr_t hipsparseSpMatDescr_t #define cusparseStatus_t hipsparseStatus_t - +#define nvtxRangePop roctxRangePop +#define nvtxRangePush roctxRangePush +#define nvtxRangePushA roctxRangePushA // // HIPCUB namespace. // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 160f5fb5c0f..e2f43ecd55c 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -14,9 +14,21 @@ CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) + -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. 
-I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ @@ -25,4 +37,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 From aed0ce594e72bc935ab1f2fade0f26aa5229a3b9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 11:44:33 -0500 Subject: [PATCH 14/22] Complete support for ROCm 5.0.2. --- src/chain/Makefile | 2 +- src/cudadecoder/Makefile | 2 +- src/cudafeat/Makefile | 2 +- .../feature-online-batched-ivector-cuda.cc | 41 +++++++++++++++++-- .../feature-online-batched-spectral-cuda.h | 4 ++ src/cudafeat/feature-spectral-cuda.h | 4 ++ src/cudafeat/online-ivector-feature-cuda.cc | 17 ++++++++ src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-allocator.cc | 4 ++ src/cudamatrix/cu-allocator.h | 4 ++ src/cudamatrix/cu-block-matrix.cc | 4 ++ src/cudamatrix/cu-common.h | 7 +++- src/cudamatrix/cu-compressed-matrix.cc | 4 ++ src/cudamatrix/cu-device.cc | 5 ++- src/cudamatrix/cu-device.h | 11 ++++- src/cudamatrix/cu-matrix.cc | 4 ++ src/cudamatrix/cu-packed-matrix.cc | 4 ++ src/cudamatrix/cu-sp-matrix.cc | 4 ++ src/cudamatrix/cu-sparse-matrix.cc | 4 ++ src/cudamatrix/cu-tp-matrix.cc | 4 ++ src/cudamatrix/cu-vector.cc | 4 ++ src/hip/hipify.h | 16 +++++--- 22 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 5cc8d8901a1..5b177981ad8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -36,7 +36,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index 062e9a47d41..d4eda345564 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -44,7 +44,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c3a4489e18e..c0f54a854e8 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -47,7 +47,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 6d68c93f917..68c247b43e9 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -17,9 +17,6 @@ #ifdef __IS_HIP_COMPILE__ #include "hipify.h" -#define 
cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched // The BLAS enumerators are used instead of the SOLVER ones. #ifdef CUBLAS_FILL_MODE_LOWER #undef CUBLAS_FILL_MODE_LOWER @@ -385,6 +382,43 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; + +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work; + + // query temp buffer size for the batched factorization + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, &L_work, num_lanes)); + // allocate temp buffer + float *workspace = static_cast( + CuDevice::Instantiate().Malloc(L_work * sizeof(float))); + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); + + int L_work2; + + // query temp buffer size for the batched solve + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + // solve for rhs in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, + num_lanes)); + + CuDevice::Instantiate().Free(workspace); + CuDevice::Instantiate().Free(workspace2); +#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -395,6 +429,7 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); +#endif #endif // cusolver solves in place. 
Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..202232c6b23 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..66f0dce395a 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index c3b15d72a5b..56dbac93165 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -317,9 +317,26 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work2; + CUSOLVER_SAFE_CALL( + hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + CUSOLVER_SAFE_CALL(hipsolverSpotrs( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); + + CuDevice::Instantiate().Free(workspace2); +#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); +#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 5cd4adcffd8..3c1100753e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -37,7 +37,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..d81dca002ce 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,11 @@ #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..f776bbb620e 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..7983cd250e7 100644 --- 
a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..c4bdf569d3c 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,15 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include -#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..442d2dbac67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..3dada172ba8 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,10 +21,13 @@ // limitations under the License. - #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index d7edf5a5a1c..67b9f1d9e9b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,16 @@ #include #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include #include -#include #include "hipify.h" #else #include @@ -44,7 +49,11 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..9897917a33f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..4de0fcba63d 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..86a3cd9a726 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..93d10099466 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 
5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..739bab3dd59 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..1deb1cb8733 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 24b5f2f8eb3..b631ac08a23 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -191,17 +191,22 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #else #define cusolverDnCreate hipsolverCreate #define cusolverDnDestroy hipsolverDestroy #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream +#define cusolverDnSpotrf hipsolverSpotrf +#define cusolverDnSpotrfBatched hipsolverSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverSpotrs +#define cusolverDnSpotrsBatched hipsolverSpotrsBatched #endif -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -235,3 +240,4 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #endif //__HIPIFY_H__ + From 6d8dd4c2337f224bb7f230cbb41d5e5311c75632 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 20 Oct 2023 11:48:09 +0000 Subject: [PATCH 15/22] Fix __CUDA_ARCH__ issue and add more hipification. 
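
HIPCC leaves __CUDA_ARCH__ undefined, so arch guards in the kernels, such as
the '#if __CUDA_ARCH__ == 750' launch-bounds specialization in
feature-online-cmvn-cuda.cu, evaluate it as 0 and resolve as if compiling for
an unknown device. This change moves the -D__CUDA_ARCH__=800 definition out of
ROCM_FLAGS and into the individual .cu files, just before the HIP headers,
since (per the TODO added to hip_64bit.mk) defining it globally makes HIPCC
assume CUDA is active and that everything is a device compile. A sketch of the
per-file pattern, with illustrative header names since the exact includes vary
per file:

    #ifdef __IS_HIP_COMPILE__
    #define __CUDA_ARCH__ 800  // sm_80-class; "mostly supported by ROCm"
    #include "hip/hip_runtime.h"
    #include "hipify.h"
    #endif

    #if __CUDA_ARCH__ == 750  // now compares against a concrete value
    __launch_bounds__ (1024, 1)
    #endif
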
--- src/chain/chain-kernels.cu | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeatbin/Makefile | 8 +++++--- src/cudafeatbin/apply-batched-cmvn-online-cuda.cc | 2 ++ .../compute-fbank-online-batched-cuda.cc | 2 ++ .../compute-mfcc-online-batched-cuda.cc | 2 ++ .../compute-online-feats-batched-cuda.cc | 2 ++ src/cudafeatbin/compute-online-feats-cuda.cc | 2 ++ src/cudamatrix/cu-kernels.cu | 1 + src/hip/hipify.h | 5 +++++ src/makefiles/hip_64bit.mk | 15 +++++++++++---- 11 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 2a30128750c..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -21,6 +21,7 @@ #include "chain/chain-kernels-ansi.h" #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #endif diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 8d4648d04bb..1c896f1307f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -16,6 +16,7 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #else diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..2cd6bbb6a93 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,9 +16,11 @@ // limitations under the License. 
#if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include #include #endif +#endif #include #include 
diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..70380f8ccad 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudafeat/online-cuda-feature-pipeline.h" 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1d6e0664541..1b0cf1f2c90 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,6 +30,7 @@ #include #include #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" 
diff --git a/src/hip/hipify.h b/src/hip/hipify.h index b631ac08a23..723b5b1f059 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -148,6 +148,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaHostRegister hipHostRegister #define cudaHostRegisterDefault hipHostRegisterDefault #define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch @@ -157,12 +158,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop #define cudaSetDevice hipSetDevice #define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent 
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index e2f43ecd55c..8d85872aa9b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -29,12 +29,19 @@ ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ -I$(ROCMDIR)/rocrand/include \ -I$(ROCMDIR)/include \ -I.. -I../hip -isystem $(OPENFSTINC) + +# TODO: Consider passing __CUDA_ARCH__=800 here as it is mostly supported by ROCm. +# However this macro has side effects with HIPCC that make it assume +# CUDA is active and that everything is a device compile. ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ - -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 + -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics -#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +# TODO: Consider using ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. 
+# We allow the libraries we link against to have undefined symbols so that this +# can be built on systems with no development version of these libraries (e.g. ncurses). CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined +LDLIBS += -Wl,--allow-shlib-undefined 
From f584420d8c1448e8e70f9106aa49712f63d06347 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 6 Nov 2023 16:57:15 +0000 Subject: [PATCH 16/22] Fix tests with zero-size matrices and add syncwarp where needed for LDS sharing. --- ...ure-online-batched-ivector-cuda-kernels.cu | 30 +++-- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 4 +- .../online-ivector-feature-cuda-kernels.cu | 26 ++-- src/cudamatrix/cu-device.cc | 4 + src/cudamatrix/cu-kernels.cu | 127 ++++++++++++++---- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-math.cc | 2 +- src/cudamatrix/cu-matrix-test.cc | 24 +++- src/cudamatrix/cu-matrix.cc | 12 +- src/cudamatrix/cu-sparse-matrix.cc | 6 +- src/cudamatrix/cu-vector.cc | 13 +- src/hip/hipify.h | 35 ++++- src/makefiles/hip_64bit.mk | 7 +- 15 files changed, 219 insertions(+), 90 deletions(-) 
diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b4cfce812c..e5b89d163e5 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -50,7 +50,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -101,8 +101,10 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -215,8 +217,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -311,10 +313,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less 
than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() + KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, feat_dim, ldf, @@ -324,8 +326,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, { int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -507,8 +509,8 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -584,7 +586,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -629,7 +631,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( 
diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index f847311d755..27375f4914e 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -68,7 +68,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -487,7 +487,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, 
diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 1c896f1307f..f8947a3b5ed 100644 --- 
a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -188,8 +188,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; 
diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index c320c85a029..9c0d5df5288 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -134,7 +134,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -493,7 +493,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, 
diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 378ea18e689..dffc9fd3c8f 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -26,17 +26,17 @@ #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +47,13 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,9 +139,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize 
WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -207,7 +207,7 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +215,8 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -238,7 +238,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -249,7 +249,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); 
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 3dada172ba8..25775fb1b05 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -249,8 +249,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode + #ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; + #else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; + #endif // We want to choose the device more carefully, so release the CUDA context. 
e = cudaDeviceReset(); if (e != cudaSuccess) { 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1b0cf1f2c90..792932c18d5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -966,6 +966,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1118,8 +1119,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1176,6 +1177,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1219,6 +1221,7 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1270,6 +1273,7 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1353,6 +1357,7 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1805,6 +1810,7 @@ static void _vec_transform_reduce( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } } @@ -1904,7 +1910,6 @@ __global__ void _strided_reduction_fused_kernel(Real * __restrict__ dots, const int idx = colStart + (j + u*stride) * d.stride; vals[u] = op.Transform(data[idx]); } - #pragma unroll for (int u = 0; u < unroll_count; ++u) { thread_data = op.Reduce(thread_data, vals[u]); @@ -2018,6 +2023,7 @@ static void _transform_reduce_mat_rows( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } // Output to vector result. @@ -2053,12 +2059,14 @@ static void _transform_reduce_mat_cols( __syncthreads(); } // Reduce last warp. Threads implicitly synchronized within a warp. if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) - sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + for (int shift = warpSize; shift > 0; shift >>= 1) { + sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); + } } // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2117,6 +2159,7 @@ static void _group_transform_reduce( # pragma unroll for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); + __syncwarp(); } } @@ -2981,6 +3024,7 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, for (int shift = warpSize; shift > 0; shift >>= 1) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; + __syncwarp(); } } @@ -3271,6 +3315,7 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } + __syncwarp(0xffffffffu >> (32-num_working_threads)); } } @@ -3999,7 +4044,7 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) // grid dim x,y expands to fill out average in x/y across batches // grid dim.z is batch template @@ -4380,7 +4425,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4401,6 +4446,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -4409,9 +4459,13 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, const float* N, const MatrixDim dim_N, const float beta, float* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -4451,6 +4505,7 @@ void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim, transpose, elements); } + void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -5086,7 +5141,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, 
@@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha,
   } else if (Bl.x == 32) {
     _add_diag_mat_mat_MTN<32> <<<Gr,Bl>>>(alpha, M, stride_M, N, dim_N, beta,
                                           v, stride_v);
+#ifdef __IS_HIP_COMPILE__
+  } else if (Bl.x == 64) {
+    _add_diag_mat_mat_MTN<64> <<<Gr,Bl>>>(alpha, M, stride_M, N, dim_N, beta,
+                                          v, stride_v);
+#endif
   }
 }
 
@@ -5115,9 +5175,13 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha,
                                const double* N, const MatrixDim dim_N,
                                const double beta, double* v) {
   if (Bl.x == 16) {
-    _add_diag_mat_mat_MN<16> <<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+    _add_diag_mat_mat_MN<16><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
   } else if (Bl.x==32) {
     _add_diag_mat_mat_MN<32><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+#ifdef __IS_HIP_COMPILE__
+  } else if (Bl.x==64) {
+    _add_diag_mat_mat_MN<64><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+#endif
   }
 }
 
@@ -5488,25 +5552,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out,
 void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out,
                                  const float* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out,
                                  const float* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out,
                                  const double* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out,
                                  const double* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
@@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
 // this will synchronize all threads without blocking.
 void cuda_legacy_noop() {
+#ifdef __IS_HIP_COMPILE__
+  // HIP doesn't currently support the cudaStreamLegacy stream, so we force
+  // use of the non-per-thread API to get similar semantics.
+  auto k = reinterpret_cast<void *>(_noop_kernel);
+  hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0);
+#else
   _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>();
+#endif
 }
 
 void cudaF_mat_copy_range_clamped(
@@ -5812,8 +5883,8 @@
     float *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end,
       num_cols, src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5826,8 +5897,8 @@ void cudaD_mat_copy_range_clamped(
     double *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end,
       num_cols, src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5837,7 +5908,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const float **inputs,
                              int32_t *ldi, float **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc<float> batch_desc;
 
@@ -5863,8 +5934,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here. Memory will be passed down directly
@@ -5886,8 +5957,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here. Memory will be passed down directly
@@ -5902,7 +5973,7 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const double **inputs,
                              int32_t *ldi, double **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc<double> batch_desc;
 
@@ -5928,8 +5999,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.
Memory will be passed down directly @@ -5951,8 +6022,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, remaining); // no memcpy needed here. Memory will be passed down directly diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index e1d59e777be..1245fb28bad 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), - cell_dim = RandInt(2, 200), - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); + int32 num_rows = RandInt(5, 20), //16 + cell_dim = RandInt(2, 200), //45 + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); //3 // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,7 +232,6 @@ void UnitTestLstmNonlinearity() { else test_params = -1; - CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -277,11 +276,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { + if (test_input >= 0) { // -1 delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { + if (test_params >= 0) { // 0 delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 3fbeff3a470..d0d8e4e771f 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), // n_blocks(num_rows, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index be8483e48f5..26a5281ec05 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,10 +2675,18 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { + + // if (CuDevice::Instantiate().Enabled()) { + // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); + // } + for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; + MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); + // M.SetZero(); + // M.Add(0.5); + // M.SetZeroAboveDiag(); M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
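The hunk that follows continues UnitTestCuMatrixSetRandUniform, which centers U(0,1) samples with M.Add(-0.5) and then checks the central moments of the result. For the uniform distribution on [-1/2, 1/2] the odd central moments are zero and the even ones are (1/2)^p / (p+1), which is what the test's expected values encode; a small self-contained helper (ours, for illustration only) would be:

    #include <cmath>

    // Expected p-th central moment of U(-0.5, 0.5):
    // zero for odd p, (0.5^p) / (p + 1) for even p.
    double uniform_central_moment(int p) {
      return (p % 2 == 1) ? 0.0 : std::pow(0.5, p) / (p + 1);
    }

For p = 2 this gives 1/12, the familiar variance of a unit-width uniform distribution.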
@@ -2693,6 +2701,16 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); + + // if (CuDevice::Instantiate().Enabled()) { + // CuVector col_sum(rows, kUndefined); + // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); + // KALDI_LOG << "Sums vector is " << col_sum; + // Real ans = col_sum.Sum(); + // KALDI_LOG << "Total sum is " << ans; + // KALDI_ERR << "Stopping!"; + // } + Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2705,10 +2723,12 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; + //KALDI_LOG << "Random vector sum is " << col_sum; KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } + KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; } } } @@ -3061,7 +3081,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = true; + bool test_threads = false; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 9897917a33f..56acf340823 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -253,7 +253,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, } else { // 2D thread block with warps (blockDim.x) along the row-dim of input M. // Each (8x32) thread block will transpose (32x32) data - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(M.NumCols(), warpSize), n_blocks(M.NumRows(), warpSize)); @@ -859,7 +859,7 @@ void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), n_blocks(NumRows(), dimBlock.y)); @@ -1009,7 +1009,7 @@ void CuMatrixBase::AddSmat(Real alpha, const CuSparseMatrix &A, // We use warpSize threads per row to access only the nonzero elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of A. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y)); @@ -2186,7 +2186,7 @@ Real TraceMatMat(const CuMatrixBase &A, // if the matrix is not in a very bad shape. // (wider or taller than 32x8192) // CPU will then reduce to 1 element. 
- const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), n_blocks(A.NumRows(), kWarpSize)); @@ -2408,7 +2408,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { // and use transposed copy to fill *this // see CuMatrixBase::CopyFromMat() for more detail of the impl MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ }; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(rv_dim.cols, warpSize), n_blocks(rv_dim.rows, warpSize)); @@ -2418,7 +2418,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } else if (rv.Dim() == num_rows_) { // use 2D block (8x32) and large enough grid to cover matrix *this // dimBlock.x need to be at least warpSize for coalesced memory access. - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), n_blocks(num_rows_, dimBlock.y)); diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 93d10099466..1a82ce0d4df 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -148,7 +148,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); @@ -558,7 +558,7 @@ Real TraceMatSmat(const CuMatrixBase &A, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); @@ -648,7 +648,7 @@ void CuSparseMatrix::CopyToMat(CuMatrixBase *M, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 1deb1cb8733..f6426297e49 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -639,7 +639,10 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) - // 16x16 or 8x32 2D block for coalesced memory access. + // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2 + // or + // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE + // 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. 
for small matrices, use 1D grid with only 1 row of 16x16 block,
 //     to avoid multiple kernel launch;
@@ -647,11 +650,11 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     //    use 1- or 2-D grid so that the grid contains
     //    at least and not much larger than 'kOptNumBlocks' blocks
     //    to fully utilize the GPU;
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     const int32 kOptNumBlocks = 512;
     const int32 tile_dim =
         (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
-        16 : 32;
+        GPU_WARP_SIZE/2 : GPU_WARP_SIZE;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
                  n_blocks(N.NumRows(), dimBlock.y));
@@ -678,7 +681,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // One block per 'tile_dim' columns of N.
     // 1D grid expands along the row of N.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
@@ -687,7 +690,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // Case 4: diag(M'*N') == sum(N'.*M, 1)
     // Same kernel and config as case 3 except M and N are swapped.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index 723b5b1f059..56d7e869a32 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -2,7 +2,19 @@
 #ifdef __HIPCC__
-inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
+inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
+  // On CDNA hardware wave-fronts (warps) always execute in
+  // lock step. It might still be important, though, to signal
+  // that the compiler can't reorder code around certain code
+  // sections that rely on data sharing mechanisms like LDS
+  // (shared memory). So this implements a no-op that is seen
+  // by the compiler as having side effects.
+  __asm__("s_nop 0");
+
+  // A safer, though arguably less performant, option would be to use:
+  //   __asm__("s_waitcnt lgkmcnt(0)");
+  // to explicitly do a memory fence.
+}
 // AMDGCN only supports this rounding mode.
 #define __fdiv_rd __fdiv_rn
 #else
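The no-op-with-side-effects trick above relies on GCC-style inline assembly: an asm statement without outputs is implicitly volatile, so the compiler must keep it in place, which stops it from deleting or merging the surrounding reduction steps; the wavefront itself already runs in lock step on CDNA, so no hardware barrier instruction is required. A stricter variant (ours, not the patch's) adds an explicit memory clobber so that shared-memory accesses cannot be reordered across the fence either:

    // Compiler-only warp fence for lock-step wavefronts (sketch).
    // The "memory" clobber forces the compiler to treat all memory as
    // potentially touched, so LDS reads/writes are not moved across it.
    __device__ inline void wave_compiler_fence() {
      __asm__ __volatile__("s_nop 0" ::: "memory");
    }

Whether the clobber-free form in the patch is sufficient depends on how conservatively the compiler treats implicitly volatile asm; the s_waitcnt alternative mentioned in the comment is the belt-and-braces choice.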
@@ -153,7 +165,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaMallocHost hipHostMalloc
 #define cudaMallocPitch hipMallocPitch
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
+// hipMemcpy2DAsync behaves differently from its CUDA counterpart for
+// zero-sized copies; the disparity should be resolved by ROCm 5.7.1+. Then
+// the following would be sufficient:
+//   #define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f)       \
+  [&]() -> hipError_t {                                   \
+    if (width && height)                                  \
+      return hipMemcpy2DAsync(a,b,c,d,width,height,e,f);  \
+    return hipSuccess;                                    \
+  }()
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -166,8 +187,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaStreamCreate hipStreamCreate
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamLegacy ((hipStream_t)1)
-#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread ((hipStream_t)2)
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent hipStreamWaitEvent
@@ -243,6 +263,13 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 
 // #define cub hipcub
 
+//
+// Callback qualifier
+//
+#define CUDART_CB
+#define GPU_WARP_SIZE 64
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
 
 #endif //__HIPIFY_H__
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk
index 8d85872aa9b..aec3e359f53 100644
--- a/src/makefiles/hip_64bit.mk
+++ b/src/makefiles/hip_64bit.mk
@@ -37,11 +37,14 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \
              -D__IS_HIP_COMPILE__=1 \
              -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \
              -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \
-             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics
+             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics \
+             $(EXTRA_ROCM_FLAGS)
+
 # TODO: Consider using ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles.
 # We allow the libraries we link against to have undefined symbols so that this can be built on
 # systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
 CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
-LDLIBS += -Wl,--allow-shlib-undefined
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

From ba4e18fcb2987b7172057aa5fc2613a9e1c1f2f8 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 17:14:29 +0000
Subject: [PATCH 17/22] Move misplaced #pragma unroll.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ac532790b86..349b21b6591 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2135,8 +2135,8 @@ static void _group_transform_reduce(
   __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
-# pragma unroll
   int shift = threads_per_group / 2;
+# pragma unroll
   for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);

From dac0b272cfff3fba9be4b3cfdd2767271e0d4760 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 23:46:48 +0000
Subject: [PATCH 18/22] Working version trimmed of legacy ROCm < 5.2 code.
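The GPU_WARP_SIZE / GPU_MAX_WARPS_PER_BLOCK pair defined in hipify.h above is what makes the launch-geometry rewrites earlier in the series warp-size-agnostic: a block stays at GPU_MAX_THREADS_PER_BLOCK threads whether a warp is 32 lanes (CUDA) or 64 (CDNA). A sketch of the resulting sizing idiom, assuming the CUDA build defines the same macros as 32/1024 (the helper names are illustrative, not Kaldi's):

    // Ceil-division used throughout the series to size grids.
    __host__ __device__ constexpr int ceil_div(int n, int d) {
      return (n + d - 1) / d;
    }

    // Block: one warp wide, as many warps tall as fit in one block.
    // Grid: enough blocks to tile a rows x cols matrix.
    dim3 make_threads() {
      return dim3(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
    }
    dim3 make_blocks(int num_rows, int num_cols) {
      return dim3(ceil_div(num_cols, GPU_WARP_SIZE),
                  ceil_div(num_rows, GPU_MAX_WARPS_PER_BLOCK));
    }

On CUDA this reproduces the original 32x32 blocks and (n+31)/32 grids; on HIP it yields 64x16 blocks, keeping the per-block thread count at 1024.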
--- .gitignore | 4 - src/chain/Makefile | 12 -- src/configure | 22 +-- src/cudadecoder/Makefile | 12 -- src/cudadecoder/cuda-decoder.cc | 2 +- src/cudafeat/Makefile | 12 -- .../feature-online-batched-ivector-cuda.cc | 38 ----- .../feature-online-batched-spectral-cuda.h | 4 - src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeat/feature-spectral-cuda.h | 4 - src/cudafeat/online-ivector-feature-cuda.cc | 20 +-- src/cudamatrix/Makefile | 12 -- src/cudamatrix/cu-allocator.cc | 4 - src/cudamatrix/cu-allocator.h | 4 - src/cudamatrix/cu-block-matrix.cc | 4 - src/cudamatrix/cu-common.h | 5 - src/cudamatrix/cu-compressed-matrix.cc | 4 - src/cudamatrix/cu-device.cc | 5 +- src/cudamatrix/cu-device.h | 9 -- src/cudamatrix/cu-kernels.cu | 33 +--- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-matrix-test.cc | 30 +--- src/cudamatrix/cu-matrix.cc | 4 - src/cudamatrix/cu-packed-matrix.cc | 4 - src/cudamatrix/cu-sp-matrix.cc | 4 - src/cudamatrix/cu-sparse-matrix.cc | 4 - src/cudamatrix/cu-tp-matrix.cc | 4 - src/cudamatrix/cu-vector.cc | 4 - src/hip/hipify.h | 12 -- src/hip/math_constants.h | 152 ------------------ src/makefiles/hip_64bit.mk | 3 + 31 files changed, 29 insertions(+), 414 deletions(-) delete mode 100644 src/hip/math_constants.h diff --git a/.gitignore b/.gitignore index 53a4079d9ef..9f8c727d4d0 100644 --- a/.gitignore +++ b/.gitignore @@ -90,7 +90,3 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* - -# Eclipse sync project -.ptp-sync -.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index 5b177981ad8..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,21 +33,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index 5f9c48a6cde..37a75a5cade 100755 --- a/src/configure +++ b/src/configure @@ -295,12 +295,11 @@ function configure_rocm { ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk - # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards use - # __HIP_PLATFORM_AMD__ others __HIP_PLATFORM_HCC__ - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk - else - echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; fi # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, @@ -309,17 +308,10 @@ function configure_rocm { cat makefiles/hip_64bit.mk >> kaldi.mk else echo "\ -WARNING: ROCM will not be used! 
- ROCM is only supported with 64-bit Linux builds." +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." exit 1; fi - - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk - else - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk - fi } diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index d4eda345564..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -41,21 +41,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 06dceae73a5..9baa274e2ea 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -199,7 +199,7 @@ void CudaDecoder::AllocateHostData() { (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c0f54a854e8..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -44,21 +44,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 68c247b43e9..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -382,43 +382,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; - -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work; - - // perform factorization in batched - 
CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, &L_work, num_lanes)); - // allocate temp buffer - float *workspace = static_cast( - CuDevice::Instantiate().Malloc(L_work * sizeof(float))); - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); - - int L_work2; - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - // solve for rhs in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, - num_lanes)); - - CuDevice::Instantiate().Free(workspace); - CuDevice::Instantiate().Free(workspace2); -#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -429,7 +392,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); -#endif #endif // cusolver solves in place. Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 202232c6b23..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index f8947a3b5ed..bb78028118f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -27,6 +27,7 @@ #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. #ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 66f0dce395a..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index 56dbac93165..fa0e9f68237 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -299,14 +299,13 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. 
CuMatrix A(quadratic); - - #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); + // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); @@ -317,26 +316,9 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work2; - CUSOLVER_SAFE_CALL( - hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - CUSOLVER_SAFE_CALL(hipsolverSpotrs( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); - - CuDevice::Instantiate().Free(workspace2); -#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); -#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 3c1100753e5..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,20 +34,8 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index d81dca002ce..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,11 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f776bbb620e..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 7983cd250e7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index c4bdf569d3c..41ef7536a7f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,13 +32,8 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || 
ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 442d2dbac67..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 25775fb1b05..4d0be20ddc3 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,13 +21,10 @@ // limitations under the License. + #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 67b9f1d9e9b..bb1170314c4 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,13 +29,8 @@ #include #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include @@ -49,11 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 349b21b6591..3d7fae5c15e 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,15 +27,18 @@ #include #include -#include #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 +#include #include #include "hipify.h" +#define CUDART_INF HIP_INF +#define CUDART_INF_F HIP_INF_F #include "cudamatrix/cu-kernels-ansi.h" #include #include #else +#include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION @@ -2048,27 +2051,9 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } - - // if (tid == 0) { - // for (int j = 0; j < d.cols; j += 1) - // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); - // result[i] = tdata; - - // } - // return; - sdata[tid] = tdata; __syncthreads(); - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < CU1DBLOCK; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2077,16 +2062,6 @@ static void _transform_reduce_mat_cols( __syncthreads(); } - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < 2*warpSize; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - - // Reduce last warp. Threads implicitly synchronized within a warp. for (int shift = warpSize; shift > 0; shift >>= 1) { if (tid < warpSize) { diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 1245fb28bad..e1d59e777be 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), //16 - cell_dim = RandInt(2, 200), //45 - dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); //3 + int32 num_rows = RandInt(5, 20), + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,6 +232,7 @@ void UnitTestLstmNonlinearity() { else test_params = -1; + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -276,11 +277,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { // -1 + if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { // 0 + if (test_params >= 0) { delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 26a5281ec05..ecddd24db19 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,19 +2675,11 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { - - // if (CuDevice::Instantiate().Enabled()) { - // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); - // } - for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; + MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - // M.SetZero(); - // M.Add(0.5); - // M.SetZeroAboveDiag(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. // Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2701,16 +2693,6 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); - - // if (CuDevice::Instantiate().Enabled()) { - // CuVector col_sum(rows, kUndefined); - // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); - // KALDI_LOG << "Sums vector is " << col_sum; - // Real ans = col_sum.Sum(); - // KALDI_LOG << "Total sum is " << ans; - // KALDI_ERR << "Stopping!"; - // } - Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2723,13 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - //KALDI_LOG << "Random vector sum is " << col_sum; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; - } + } } } @@ -3081,7 +3061,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = false; + bool test_threads = true; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. 
int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 56acf340823..fd31758f0e6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,11 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 4de0fcba63d..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,11 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 86a3cd9a726..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 35ba3ee0c81..cda575b1914 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 739bab3dd59..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index f6426297e49..c88b3ebf50c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 56d7e869a32..efe4848c009 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -211,7 +211,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t -#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t @@ -221,17 +220,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize #define cusolverDnSpotrs hipsolverDnSpotrs #define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#else -#define cusolverDnCreate hipsolverCreate -#define cusolverDnDestroy hipsolverDestroy -#define cusolverDnHandle_t hipsolverHandle_t -#define cusolverDnSetStream hipsolverSetStream -#define cusolverDnSpotrf hipsolverSpotrf -#define cusolverDnSpotrfBatched hipsolverSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize -#define cusolverDnSpotrs 
hipsolverSpotrs -#define cusolverDnSpotrsBatched hipsolverSpotrsBatched -#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h deleted file mode 100644 index 7fb8fce8e71..00000000000 --- a/src/hip/math_constants.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__MATH_CONSTANTS_H__) -#define __MATH_CONSTANTS_H__ - -/* single precision constants */ -#define CUDART_INF_F __int_as_float(0x7f800000) -#define CUDART_NAN_F __int_as_float(0x7fffffff) -#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) -#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) -#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) -#define CUDART_ZERO_F 0.0f -#define CUDART_ONE_F 1.0f -#define CUDART_SQRT_HALF_F 0.707106781f -#define CUDART_SQRT_HALF_HI_F 0.707106781f -#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f -#define CUDART_SQRT_TWO_F 1.414213562f -#define CUDART_THIRD_F 0.333333333f -#define CUDART_PIO4_F 0.785398163f -#define CUDART_PIO2_F 1.570796327f -#define CUDART_3PIO4_F 2.356194490f -#define CUDART_2_OVER_PI_F 0.636619772f -#define CUDART_SQRT_2_OVER_PI_F 0.797884561f -#define CUDART_PI_F 3.141592654f -#define CUDART_L2E_F 1.442695041f -#define CUDART_L2T_F 3.321928094f -#define CUDART_LG2_F 0.301029996f -#define CUDART_LGE_F 0.434294482f -#define CUDART_LN2_F 0.693147181f -#define CUDART_LNT_F 2.302585093f -#define CUDART_LNPI_F 1.144729886f -#define CUDART_TWO_TO_M126_F 1.175494351e-38f -#define CUDART_TWO_TO_126_F 8.507059173e37f -#define CUDART_NORM_HUGE_F 3.402823466e38f -#define CUDART_TWO_TO_23_F 8388608.0f -#define CUDART_TWO_TO_24_F 16777216.0f -#define CUDART_TWO_TO_31_F 2147483648.0f -#define CUDART_TWO_TO_32_F 4294967296.0f -#define CUDART_REMQUO_BITS_F 3 -#define CUDART_REMQUO_MASK_F (~((~0)< Date: Mon, 6 Nov 2023 23:59:05 +0000 Subject: [PATCH 19/22] Fix formating to Google style. --- .../batched-static-nnet3-kernels.h | 1 + ...hed-threaded-nnet3-cuda-online-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline2.cc | 1 + src/cudadecoder/cuda-decoder-kernels.cu | 3 +- src/cudadecoder/cuda-decoder.cc | 22 +- src/cudadecoder/cuda-decoder.h | 1 + src/cudadecoder/cuda-fst.cc | 1 + .../batched-wav-nnet3-cuda-online.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 1 + ...eature-online-batched-cmvn-cuda-kernels.cu | 1 + ...ure-online-batched-ivector-cuda-kernels.cu | 27 +- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- .../feature-online-batched-spectral-cuda.h | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 2 + src/cudafeat/feature-spectral-cuda.h | 1 + src/cudafeat/feature-window-cuda.cu | 1 + .../online-batched-feature-pipeline-cuda.cc | 4 +- .../online-ivector-feature-cuda-kernels.cu | 28 +- src/cudafeat/online-ivector-feature-cuda.cc | 1 + src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 3 +- src/cudamatrix/cu-array-inl.h | 1 + src/cudamatrix/cu-array.cc | 1 + src/cudamatrix/cu-block-matrix.cc | 1 + src/cudamatrix/cu-common.cc | 121 +++-- src/cudamatrix/cu-common.h | 5 +- src/cudamatrix/cu-compressed-matrix.cc | 1 + src/cudamatrix/cu-device.cc | 11 +- src/cudamatrix/cu-device.h | 5 +- src/cudamatrix/cu-kernels.cu | 107 ++-- src/cudamatrix/cu-matrix-test.cc | 6 +- src/cudamatrix/cu-matrix.cc | 1 + src/cudamatrix/cu-packed-matrix.cc | 1 + src/cudamatrix/cu-sp-matrix.cc | 1 + src/cudamatrix/cu-sparse-matrix.cc | 1 + src/cudamatrix/cu-tp-matrix.cc | 1 + src/cudamatrix/cu-vector.cc | 16 +- src/cudamatrix/cublas-wrappers.h | 13 +- src/hip/hipify.h | 488 +++++++++--------- 42 files changed, 512 insertions(+), 384 deletions(-) diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 
0bcb1997576..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index c7012b686e0..ed0c0a2f5e9 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -23,6 +23,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index d5cf7dae2d7..23d0ca283a2 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -28,6 +28,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index f6a3455db01..01d6b1165e7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 6a14371911d..8503182c1f8 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -16,8 +16,9 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ -#include "float.h" #include + +#include "float.h" #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 9baa274e2ea..056d563a791 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -40,6 +40,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -190,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_tmp_, + (void 
**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index 510904aa004..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 3af37eb7676..682485f6ce4 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 56368853df2..2bc0a483a0f 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -25,8 +25,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 05af50d7a3b..0e4a719bc75 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -19,8 +19,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index c14571f2ed9..b2ad9254c67 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -22,6 +22,7 @@ #include #include #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index 7a521d43693..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -17,6 +17,7 @@ // #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index e5b89d163e5..da2ba24bd90 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -102,8 +103,9 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, 
GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, num_lanes); zero_invalid_posteriors_kernel<<>>( @@ -217,8 +219,10 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -314,8 +318,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // is less than stash size KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); - // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); @@ -325,9 +329,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -510,7 +516,8 @@ void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index 27375f4914e..856d2acab81 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -18,8 +18,10 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" #ifdef __IS_HIP_COMPILE__ -#include #include + +#include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..d18f5237e8f 100644 --- 
a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index bb78028118f..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -18,6 +18,7 @@ #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 #include + #include "hipify.h" #else #include @@ -189,7 +190,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 9c0d5df5288..d8fc215b80b 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -19,7 +19,9 @@ #ifdef __IS_HIP_COMPILE__ #include + #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index 6ba45e682c1..60fe113d402 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 650b51ec3c7..7736f525237 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -100,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index dffc9fd3c8f..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -17,6 +17,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -34,9 +35,12 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps - __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - 
__shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,7 +145,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; @@ -207,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index fa0e9f68237..f96b2a81ce2 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" // The BLAS enumerators are used instead of the SOLVER ones. 
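A note on the launch-geometry idiom being reformatted throughout these kernels: block and grid sizes are derived from GPU_WARP_SIZE and GPU_MAX_WARPS_PER_BLOCK (defined in src/hip/hipify.h) rather than from hard-coded 32 and 32, so the same source sizes blocks correctly for AMD's 64-wide wave-fronts. A minimal sketch of the pattern, using the hypothetical helper name blocks_2d:

static inline dim3 blocks_2d(int num_cols, int num_rows, int num_lanes) {
  // Pair with dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); round up
  // so partial tiles at the right and bottom edges are still covered.
  return dim3((num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
              (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
              num_lanes);
}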
#ifdef CUBLAS_FILL_MODE_LOWER diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,6 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -33,7 +34,6 @@ #include #endif - #include #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,9 +24,10 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 1fd80502cf9..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,6 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 333e8fbed1c..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,6 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 2e77062f20d..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #define API_NAME_PREFIX "HIP" #else @@ -59,7 +60,7 @@ NvtxTracer::~NvtxTracer() { #ifdef __IS_HIP_COMPILE__ roctxRangePop(); #else - nvtxRangePop(); + nvtxRangePop(); #endif } #endif @@ -102,19 +103,31 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + 
case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; #endif } return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; @@ -124,20 +137,32 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } return
API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } @@ -146,21 +171,35 @@ const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; #endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 41ef7536a7f..934668da6f2 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,11 +32,12 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include -#include #include +#include #include +#include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,6 
+22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 4d0be20ddc3..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,15 +24,16 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include #include #include -#endif // __IS_HIP_COMPILE__ +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -246,12 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode - #ifdef __IS_HIP_COMPILE__ +#ifdef __IS_HIP_COMPILE__ KALDI_WARN << "Not in compute-exclusive mode."; - #else +#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; - #endif +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index bb1170314c4..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,12 @@ #include #ifdef __IS_HIP_COMPILE__ -#include -#include #include #include +#include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 3d7fae5c15e..8d5784acb52 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -31,18 +31,18 @@ #define __CUDA_ARCH__ 800 #include #include + #include "hipify.h" -#define CUDART_INF HIP_INF -#define CUDART_INF_F HIP_INF_F -#include "cudamatrix/cu-kernels-ansi.h" -#include #include +#include + +#include "cudamatrix/cu-kernels-ansi.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION -#endif //__IS_HIP_COMPILE__ +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -1122,7 +1122,7 @@ void trace_mat_mat_trans_atomic(Real *d_result, // Assuming *d_result is set to zero already constexpr int THREADS_X = GPU_WARP_SIZE; - constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -2111,7 +2111,7 @@ static void _group_transform_reduce( // tree-reduce to 2x warpSize elements per group int shift = threads_per_group / 2; -# pragma unroll +#pragma unroll for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); @@ -4009,9 +4009,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4390,7 +4390,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, 
B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4413,8 +4413,8 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -4426,10 +4426,10 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5105,7 +5105,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5128,8 +5128,8 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -5141,10 +5141,10 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5516,25 +5516,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5831,8 +5831,9 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // this will synchronize all threads without blocking. 
void cuda_legacy_noop() { #ifdef __IS_HIP_COMPILE__ - // HIP doesn't currently support cudaStreamLegacy stream so we force the implementation to use the - // legacy (not per-thread) API to get similar semantics. + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. auto k = reinterpret_cast(_noop_kernel); hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); #else @@ -5847,8 +5848,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5861,8 +5864,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5871,8 +5876,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5898,9 +5902,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5920,10 +5925,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5936,8 +5942,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5963,9 +5968,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5986,10 +5992,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index ecddd24db19..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2679,7 +2679,7 @@ static void UnitTestCuMatrixSetRandUniform() { MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
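// For reference: for a uniform distribution on [-1/2, 1/2] the odd central
// moments vanish and the even ones are (1/2)^p / (p + 1), e.g. 1/12 for
// p = 2 (the variance).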
// Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2705,11 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - } + } } } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index fd31758f0e6..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,6 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index cda575b1914..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index c88b3ebf50c..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -649,8 +650,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - GPU_WARP_SIZE/2 : GPU_WARP_SIZE; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -676,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? 
GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index dc5c0e0ced5..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -37,8 +37,9 @@ inline cublasStatus_t cublas_gemm( const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, - C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -66,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, - (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif diff --git a/src/hip/hipify.h b/src/hip/hipify.h index efe4848c009..e9ca483d022 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -2,250 +2,262 @@ #define __HIPIFY_H__ #ifdef __HIPCC__ -inline __device__ void __syncwarp(unsigned mask=0xffffffff) { - // On CDNA hardware wave-fronts (warps) execute always in - // lock step. Though it might still be important to signal - // that the compiler can't reorder code around certain code - // sections that rely on data sharing mecanisms like LDS - // (shared memory). So this implements a No-op but is seen - // by the compiler as having side effects. - __asm__("s_nop 0"); +inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { + // On CDNA hardware wave-fronts (warps) execute always in + // lock step. Though it might still be important to signal + // that the compiler can't reorder code around certain code + // sections that rely on data sharing mechanisms like LDS + // (shared memory).
So this implements a No-op but is seen + // by the compiler as having side effects. + __asm__("s_nop 0"); - // A saffest option, arguably less performant would be to use: - // __asm__("s_waitcnt lgkmcnt(0)"); Í - // to explicitly do a memory fence. + // A safer option, arguably less performant, would be to use: + // __asm__("s_waitcnt lgkmcnt(0)"); + // to explicitly do a memory fence. } // AMDGCN only support this rounding mode. #define __fdiv_rd __fdiv_rn #else -#define __align__(x) __attribute__((aligned (x))) +#define __align__(x) __attribute__((aligned(x))) #endif // // HIP types // -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. -#define CUBLAS_OP_C HIPBLAS_OP_C -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_R_32F HIPBLAS_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUDA_R_32F HIP_R_32F -#define CUDA_R_64F HIP_R_64F -#define CUFFT_R2C HIPFFT_R2C -#define CUFFT_SUCCESS HIPFFT_SUCCESS -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define CUSPARSE_INDEX_BASE_ZERO
HIPSPARSE_INDEX_BASE_ZERO -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define cuDeviceGetName hipDeviceGetName -#define cuMemGetInfo_v2 hipMemGetInfo -#define cublasComputeType_t hipblasDatatype_t -#define cublasCreate hipblasCreate -#define cublasDasum_v2 hipblasDasum -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasDcopy_v2 hipblasDcopy -#define cublasDdot_v2 hipblasDdot -#define cublasDestroy hipblasDestroy -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasDgemm_v2 hipblasDgemm -#define cublasDgemv_v2 hipblasDgemv -#define cublasDger_v2 hipblasDger -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasDscal_v2 hipblasDscal -#define cublasDspmv_v2 hipblasDspmv -#define cublasDspr_v2 hipblasDspr -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmEx hipblasGemmEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasOperation_t hipblasOperation_t -#define cublasSasum_v2 hipblasSasum -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasScopy_v2 hipblasScopy -#define cublasSdot_v2 hipblasSdot -#define cublasSetStream hipblasSetStream -#define cublasSgemv_v2 hipblasSgemv -#define cublasSger_v2 hipblasSger -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasSscal_v2 hipblasSscal -#define cublasSspmv_v2 hipblasSspmv -#define cublasSspr_v2 hipblasSspr -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasStatus_t hipblasStatus_t -#define cublasStatus_t hipblasStatus_t -#define cublasStpmv_v2 hipblasStpmv -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaDataType hipDataType -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceReset hipDeviceReset -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cudaError_t hipError_t -#define cudaEventCreate hipEventCreate 
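The pattern on display in this header, in both the old and the reformatted version, is a one-to-one textual mapping: every CUDA toolkit identifier is #defined to its HIP/ROCm counterpart so that call sites can keep the CUDA spellings. A hypothetical call site, assuming the mappings in this table, compiles against either runtime:

#ifdef __IS_HIP_COMPILE__
#include "hipify.h"  // rewrites cudaMalloc -> hipMalloc, cudaFree -> hipFree, ...
#else
#include <cuda_runtime.h>
#endif

void demo() {
  float *d = nullptr;
  // Written as CUDA; compiled as HIP when __IS_HIP_COMPILE__ is defined.
  if (cudaMalloc((void **)&d, 1024 * sizeof(float)) == cudaSuccess) {
    cudaMemsetAsync(d, 0, 1024 * sizeof(float), cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaFree(d);
  }
}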
-#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDestroy hipEventDestroy -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorName hipGetErrorName -#define cudaGetErrorString hipGetErrorString -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterDefault hipHostRegisterDefault -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost hipHostMalloc -#define cudaMallocPitch hipMallocPitch -#define cudaMemcpy hipMemcpy -// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP \ + HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements + // for explicit tensor GEMMs. 
+#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED \ + HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES \ + HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED \ + HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName +#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate 
hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasDtrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasStrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy +// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized // copies, which should be canceled by ROCm 5.7.1+. 
Then the following would // be sufficient: // #define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \ - [&]() -> hipError_t { \ - if (width && height) \ - return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \ - return hipSuccess; \ - }() -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemGetInfo hipMemGetInfo -#define cudaMemset2DAsync hipMemset2DAsync -#define cudaMemsetAsync hipMemsetAsync -#define cudaProfilerStop hipProfilerStop -#define cudaSetDevice hipSetDevice -#define cudaStreamCreate hipStreamCreate -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread ((hipStream_t)2) -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define cufftComplex hipfftComplex -#define cufftDestroy hipfftDestroy -#define cufftExecR2C hipfftExecR2C -#define cufftHandle hipfftHandle -#define cufftPlanMany hipfftPlanMany -#define cufftSetStream hipfftSetStream -#define curandCreateGenerator hiprandCreateGenerator -#define curandDestroyGenerator hiprandDestroyGenerator -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerator_t hiprandGenerator_t -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
-#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
-#define curandSetStream hiprandSetStream
-#define curandStatus_t hiprandStatus_t
-#define cusolverDnCreate hipsolverDnCreate
-#define cusolverDnDestroy hipsolverDnDestroy
-#define cusolverDnHandle_t hipsolverDnHandle_t
-#define cusolverDnSetStream hipsolverDnSetStream
-#define cusolverDnSpotrf hipsolverDnSpotrf
-#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
-#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
-#define cusolverDnSpotrs hipsolverDnSpotrs
-#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
-#define cusparseAction_t hipsparseAction_t
-#define cusparseCreate hipsparseCreate
-#define cusparseCreateCsr hipsparseCreateCsr
-#define cusparseCreateDnMat hipsparseCreateDnMat
-#define cusparseCreateMatDescr hipsparseCreateMatDescr
-#define cusparseDcsr2csc hipsparseDcsr2csc
-#define cusparseDestroy hipsparseDestroy
-#define cusparseDestroy hipsparseDestroy
-#define cusparseDestroyDnMat hipsparseDestroyDnMat
-#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
-#define cusparseDestroySpMat hipsparseDestroySpMat
-#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
-#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
-#define cusparseHandle_t hipsparseHandle_t
-#define cusparseIndexBase_t hipsparseIndexBase_t
-#define cusparseMatDescr_t hipsparseMatDescr_t
-#define cusparseOperation_t hipsparseOperation_t
-#define cusparseScsr2csc hipsparseScsr2csc
-#define cusparseSetStream hipsparseSetStream
-#define cusparseSpMM hipsparseSpMM
-#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
-#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
-#define cusparseStatus_t hipsparseStatus_t
-#define nvtxRangePop roctxRangePop
-#define nvtxRangePush roctxRangePush
-#define nvtxRangePushA roctxRangePushA
+#define cudaMemcpy2DAsync(a, b, c, d, width, height, e, f)      \
+  [&]() -> hipError_t {                                         \
+    if (width && height)                                        \
+      return hipMemcpy2DAsync(a, b, c, d, width, height, e, f); \
+    return hipSuccess;                                          \
+  }()
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaMemset2DAsync hipMemset2DAsync
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaProfilerStop hipProfilerStop
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread ((hipStream_t)2)
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define cufftComplex hipfftComplex
+#define cufftDestroy hipfftDestroy
+#define cufftExecR2C hipfftExecR2C
+#define cufftHandle hipfftHandle
+#define cufftPlanMany hipfftPlanMany
+#define cufftSetStream hipfftSetStream
+#define curandCreateGenerator hiprandCreateGenerator
+#define curandDestroyGenerator hiprandDestroyGenerator
+#define curandGenerateNormal hiprandGenerateNormal
+#define curandGenerateNormalDouble hiprandGenerateNormalDouble
+#define curandGenerateUniform hiprandGenerateUniform
+#define curandGenerateUniformDouble hiprandGenerateUniformDouble
+#define curandGenerator_t hiprandGenerator_t
+#define curandSetGeneratorOffset hiprandSetGeneratorOffset
+#define curandSetGeneratorOrdering(x, y) \
+  0  // HIP does not support generator ordering.
+#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
+#define curandSetStream hiprandSetStream
+#define curandStatus_t hiprandStatus_t
+#define cusolverDnCreate hipsolverDnCreate
+#define cusolverDnDestroy hipsolverDnDestroy
+#define cusolverDnHandle_t hipsolverDnHandle_t
+#define cusolverDnSetStream hipsolverDnSetStream
+#define cusolverDnSpotrf hipsolverDnSpotrf
+#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
+#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
+#define cusolverDnSpotrs hipsolverDnSpotrs
+#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
+#define cusparseAction_t hipsparseAction_t
+#define cusparseCreate hipsparseCreate
+#define cusparseCreateCsr hipsparseCreateCsr
+#define cusparseCreateDnMat hipsparseCreateDnMat
+#define cusparseCreateMatDescr hipsparseCreateMatDescr
+#define cusparseDcsr2csc hipsparseDcsr2csc
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroyDnMat hipsparseDestroyDnMat
+#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
+#define cusparseDestroySpMat hipsparseDestroySpMat
+#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
+#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
+#define cusparseHandle_t hipsparseHandle_t
+#define cusparseIndexBase_t hipsparseIndexBase_t
+#define cusparseMatDescr_t hipsparseMatDescr_t
+#define cusparseOperation_t hipsparseOperation_t
+#define cusparseScsr2csc hipsparseScsr2csc
+#define cusparseSetStream hipsparseSetStream
+#define cusparseSpMM hipsparseSpMM
+#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
+#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
+#define cusparseStatus_t hipsparseStatus_t
+#define nvtxRangePop roctxRangePop
+#define nvtxRangePush roctxRangePush
+#define nvtxRangePushA roctxRangePushA
 //
 // HIPCUB namespace.
 //
@@ -256,8 +268,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define CUDART_CB
 
+//
+// Math constants
+//
+#define CUDART_INF HIP_INF
+#define CUDART_INF_F HIP_INF_F
+
+//
+// GPU static hardware characteristics.
+//
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024
-#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
-#endif //__HIPIFY_H__
-
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
+#endif //__HIPIFY_H__

From 3aaa32637850c919af905b1c799b3f4919d804cd Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:00:01 +0000
Subject: [PATCH 20/22] Fix more formatting to Google style.
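
For context, a sketch of what the hipify.h layer reformatted here enables
(illustrative only, not part of this patch; `host_buf`, `dev_buf` and `n` are
placeholder names): Kaldi host code written against the CUDA API builds
unchanged with hipcc, because every cuda* name below expands to its hip*
counterpart through the defines above.

  // Illustrative CUDA-spelled host code; under __IS_HIP_COMPILE__ the
  // hipify.h defines retarget each call to the HIP runtime.
  cudaStream_t stream;
  CU_SAFE_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  float *dev_buf;
  CU_SAFE_CALL(cudaMalloc(&dev_buf, n * sizeof(float)));
  CU_SAFE_CALL(cudaMemcpyAsync(dev_buf, host_buf, n * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
  CU_SAFE_CALL(cudaStreamSynchronize(stream));
  CU_SAFE_CALL(cudaFree(dev_buf));
  CU_SAFE_CALL(cudaStreamDestroy(stream));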
---
 src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu | 3 ++-
 src/cudamatrix/cu-kernels.cu                                | 2 +-
 src/hip/hipify.h                                            | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
index da2ba24bd90..5b94c34e829 100644
--- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
+++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
@@ -222,7 +222,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim,
   int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE *
                 GPU_MAX_WARPS_PER_BLOCK;  // round up to the nearest warp size
   if (threads > GPU_MAX_THREADS_PER_BLOCK)
-    threads = GPU_MAX_THREADS_PER_BLOCK;  // Max block size is GPU_MAX_THREADS_PER_BLOCK threads
+    threads = GPU_MAX_THREADS_PER_BLOCK;  // Max block size is
+                                          // GPU_MAX_THREADS_PER_BLOCK threads
 
   dim3 blocks(num_chunk_frames, num_lanes);
 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8d5784acb52..9127819eca5 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -32,11 +32,11 @@
 #include
 #include
 
-#include "hipify.h"
 #include
 #include
 
 #include "cudamatrix/cu-kernels-ansi.h"
+#include "hipify.h"
 #else
 #include
 #include "cudamatrix/cu-kernels-ansi.h"
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index e9ca483d022..459372e68b8 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -275,7 +275,7 @@ inline __device__ void __syncwarp(unsigned mask = 0xffffffff) {
 #define CUDART_INF_F HIP_INF_F
 
 //
-// GPU static hardware characteristics. 
+// GPU static hardware characteristics.
 //
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024

From 6ebab7023b01a4270cbd07b5c3bfce7f1ca2c461 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:25:49 +0000
Subject: [PATCH 21/22] Fix header ordering.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9127819eca5..9df6cea6e9d 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -32,8 +32,8 @@
 #include
 #include
 
-#include
 #include
+#include
 
 #include "cudamatrix/cu-kernels-ansi.h"
 #include "hipify.h"

From 7efdeaeb10ed0ae2593ee69faa04b5172a39aba9 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 05:16:09 -0600
Subject: [PATCH 22/22] Add GPU characteristics for CUDA.

---
 src/cudamatrix/cu-common.h   | 4 ++++
 src/cudamatrix/cu-kernels.cu | 1 +
 2 files changed, 5 insertions(+)

diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 934668da6f2..3206fe7e7f4 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -45,6 +45,10 @@
 #include
 #include
 #include
+
+#define GPU_WARP_SIZE 32
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
 #endif
 
 #define CU_SAFE_CALL(fun) \
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9df6cea6e9d..b3c3165bd96 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -39,6 +39,7 @@
 #include "hipify.h"
 #else
 #include
+#include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-kernels-ansi.h"
 #include
 #include  // for CUDA_VERSION
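
A closing note on the GPU_* characteristics introduced above (a sketch under
assumptions, not part of the patches; `num_rows` is a placeholder name): with
GPU_WARP_SIZE set to 64 for ROCm in hipify.h and to 32 for CUDA in
cu-common.h, launch-sizing code can be written once for both back ends, in
the spirit of splice_features_batched:

  // Round the thread count up to a whole number of warps, then clamp it to
  // the per-block limit; correct for both 32- and 64-wide warps.
  int threads = (num_rows + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
  if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK;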