From de3632d3a35c0a3bc942c403f073c30fa897386c Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 13:43:58 +0100 Subject: [PATCH 01/22] Insert build system changes. --- src/Makefile | 6 +++ src/chain/Makefile | 13 +++++- src/configure | 79 ++++++++++++++++++++++++++++++++-- src/cudamatrix/Makefile | 13 +++++- src/makefiles/default_rules.mk | 10 ++++- src/nnet3/Makefile | 7 ++- src/nnet3bin/Makefile | 6 +++ 7 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..bc4375e30f6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..c4411f4b997 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,18 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ed627eceedc..feb2fd276ad 100755 --- a/src/configure +++ b/src/configure @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,63 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ]; then + ROCMDIR=$base + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true">> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCM will not be used! + ROCM is only supported with 64-bit Linux builds." + exit 1; + fi + + #add cusolver flags for newer toolkits + if [ "$CUSOLVER" == "true" ]; then + echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -371,6 +431,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +663,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +713,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +794,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -1304,6 +1376,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." 
failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..31c7c5ef3e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,19 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endifn include ../makefiles/default_rules.mk diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 3ae5ed5e2dd..c27b7b0a108 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,12 +145,17 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_ENABLED), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) +ifeq ($(CUDA), true) NVCC_DEP_COMMAND = $(CUDATKDIR)/bin/nvcc -M $(CUDA_FLAGS) $(CUDA_INCLUDE) $(CUDA_SRCS) endif +ifeq ($(ROCM), true) +HIPCC_DEP_COMMAND = $(HIPCC) -M $(ROCM_FLAGS) $(ROCM_INCLUDE) $(CUDA_SRCS) +endif +endif endif .PHONY: depend @@ -162,6 +167,9 @@ endif ifneq ($(NVCC_DEP_COMMAND),) -$(NVCC_DEP_COMMAND) >> .depend.mk endif +ifneq ($(HIPCC_DEP_COMMAND),) + -$(HIPCC_DEP_COMMAND) >> .depend.mk +endif # removing automatic making of "depend" as it's quite slow. #.depend.mk: depend diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 0bf1bebe096..b6c75ac7118 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -3,9 +3,14 @@ all: include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) - +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-descriptor-test nnet-parse-test nnet-component-test \ diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2bd23273982 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -3,8 +3,14 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \ From 64c27545ce49357fe900de377eb266e9fe11f46d Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 10:03:38 -0500 Subject: [PATCH 02/22] Remove extra quote. 
--- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index feb2fd276ad..21e439eeb4b 100755 --- a/src/configure +++ b/src/configure @@ -275,7 +275,7 @@ function configure_rocm { echo "# ROCm configuration" >> kaldi.mk echo >> kaldi.mk echo IS_GPU_BUILD = true >> kaldi.mk - echo ROCM = true">> kaldi.mk + echo ROCM = true >> kaldi.mk echo "ROCMDIR = $ROCMDIR" >> kaldi.mk echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk From ee18146a6ce723de6c26a78890f6e83b484c0460 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 07:05:47 -0500 Subject: [PATCH 03/22] Add hipify header. --- src/configure | 3 +- src/cudamatrix/Makefile | 4 +- src/cudamatrix/cu-device.cc | 8 +- src/cudamatrix/cu-kernels.cu | 9 ++- src/hip/hipify.h | 22 +++++ src/hip/math_constants.h | 152 +++++++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 21 +++++ 7 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 src/hip/hipify.h create mode 100644 src/hip/math_constants.h create mode 100644 src/makefiles/hip_64bit.mk diff --git a/src/configure b/src/configure index 21e439eeb4b..fa0b77373a0 100755 --- a/src/configure +++ b/src/configure @@ -258,9 +258,10 @@ function check_for_slow_expf { function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then - for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do if [ -f $base/bin/hipcc ]; then ROCMDIR=$base + break fi done fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 31c7c5ef3e5..512028c6c13 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -39,7 +39,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endifn + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..5bcb0552924 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..c644cbc0784 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,10 +28,17 @@ #include #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "cudamatrix/cu-kernels-ansi.h" +#include +#include +#else #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions diff --git a/src/hip/hipify.h b/src/hip/hipify.h new file mode 100644 index 00000000000..41b7a02cb04 --- /dev/null +++ b/src/hip/hipify.h @@ -0,0 +1,22 @@ +#ifndef __HIPIFY_H__ +#define __HIPIFY_H__ + +inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +// +// HIP types +// +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDevice hipGetDevice +#define cudaStream_t hipStream_t +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) + +// 
+// HIPCUB +// +#define cub hipcub + + +#endif //__HIPIFY_H__ diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h new file mode 100644 index 00000000000..7fb8fce8e71 --- /dev/null +++ b/src/hip/math_constants.h @@ -0,0 +1,152 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__MATH_CONSTANTS_H__) +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ +#define CUDART_INF_F __int_as_float(0x7f800000) +#define CUDART_NAN_F __int_as_float(0x7fffffff) +#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) +#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) +#define CUDART_ZERO_F 0.0f +#define CUDART_ONE_F 1.0f +#define CUDART_SQRT_HALF_F 0.707106781f +#define CUDART_SQRT_HALF_HI_F 0.707106781f +#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f +#define CUDART_SQRT_TWO_F 1.414213562f +#define CUDART_THIRD_F 0.333333333f +#define CUDART_PIO4_F 0.785398163f +#define CUDART_PIO2_F 1.570796327f +#define CUDART_3PIO4_F 2.356194490f +#define CUDART_2_OVER_PI_F 0.636619772f +#define CUDART_SQRT_2_OVER_PI_F 0.797884561f +#define CUDART_PI_F 3.141592654f +#define CUDART_L2E_F 1.442695041f +#define CUDART_L2T_F 3.321928094f +#define CUDART_LG2_F 0.301029996f +#define CUDART_LGE_F 0.434294482f +#define CUDART_LN2_F 0.693147181f +#define CUDART_LNT_F 2.302585093f +#define CUDART_LNPI_F 1.144729886f +#define CUDART_TWO_TO_M126_F 1.175494351e-38f +#define CUDART_TWO_TO_126_F 8.507059173e37f +#define CUDART_NORM_HUGE_F 3.402823466e38f +#define CUDART_TWO_TO_23_F 8388608.0f +#define CUDART_TWO_TO_24_F 16777216.0f +#define CUDART_TWO_TO_31_F 2147483648.0f +#define CUDART_TWO_TO_32_F 4294967296.0f +#define CUDART_REMQUO_BITS_F 3 +#define CUDART_REMQUO_MASK_F (~((~0)< Date: Thu, 8 Sep 2022 18:07:47 -0500 Subject: [PATCH 04/22] Add more entries to hipificatiion header to deal with the BLAS routines. --- src/cudamatrix/cu-allocator.h | 7 ++ src/cudamatrix/cu-array-inl.h | 5 ++ src/cudamatrix/cu-common.h | 9 +++ src/cudamatrix/cu-device.h | 14 +++- src/cudamatrix/cu-matrix.cc | 6 ++ src/cudamatrix/cublas-wrappers.h | 17 ++-- src/hip/hipify.h | 129 +++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 2 +- 8 files changed, 181 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..a3baa2fb33d 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,17 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..36b829046ed 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,12 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..617f4363269 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,20 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +//TODO: tests with ROCTX #include +#include +#else #include #include #include #include #include +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..515fa4d7d25 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,26 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include +#include +#else #include #include #include #include #include - 
+#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..a522f13451a 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..dc5c0e0ced5 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,17 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, + C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,8 +66,8 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, + return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, + (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); @@ -219,6 +222,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +247,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +324,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +341,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 
&& !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 41b7a02cb04..697afc7a6d3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,14 +5,143 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types +// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // + #define cudaDevAttrWarpSize hipDeviceAttributeWarpSize #define cudaDeviceGetAttribute hipDeviceGetAttribute #define cudaGetDevice hipGetDevice +#define cudaGetErrorString hipGetErrorString #define cudaStream_t hipStream_t #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) +#define cublasStatus_t hipblasStatus_t +#define cudaError_t hipError_t +#define cusparseDestroy hipsparseDestroy +#define cudaGetLastError hipGetLastError +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cublasCreate hipblasCreate +#define cublasSetStream hipblasSetStream +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define curandCreateGenerator hiprandCreateGenerator +#define curandSetStream hiprandSetStream +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetDeviceProperties hipGetDeviceProperties +#define curandDestroyGenerator hiprandDestroyGenerator +#define cusparseDestroy hipsparseDestroy +#define cudaDeviceProp hipDeviceProp_t +#define cublasOperation_t hipblasOperation_t +#define cublasStatus_t hipblasStatus_t +#define cusparseStatus_t hipsparseStatus_t +#define curandStatus_t hiprandStatus_t +#define cublasHandle_t hipblasHandle_t +#define cusparseHandle_t hipsparseHandle_t +#define curandGenerator_t hiprandGenerator_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cublasComputeType_t hipblasDatatype_t +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasDscal_v2 hipblasDscal +#define cublasSscal_v2 hipblasSscal +#define cudaSetDevice hipSetDevice +#define cudaSuccess hipSuccess +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnSetStream hipsolverDnSetStream +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define cusparseCreate hipsparseCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusparseSetStream hipsparseSetStream +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
+#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceReset hipDeviceReset +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cublasDestroy hipblasDestroy +#define cuDeviceGetName hipDeviceGetName +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cudaMemsetAsync hipMemsetAsync +#define cublasGemmEx hipblasGemmEx +#define cublasDgemm_v2 hipblasDgemm +#define cublasSger_v2 hipblasSger +#define cublasDger_v2 hipblasDger +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasSdot_v2 hipblasSdot +#define cublasSasum_v2 hipblasSasum +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasScopy_v2 hipblasScopy +#define cublasDcopy_v2 hipblasDcopy +#define cublasSgemv_v2 hipblasSgemv +#define cublasDgemv_v2 hipblasDgemv +#define cublasSspmv_v2 hipblasSspmv +#define cublasDspmv_v2 hipblasDspmv +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasSspr_v2 hipblasSspr +#define cublasDspr_v2 hipblasDspr +#define cudaDataType hipDataType +#define cusparseAction_t hipsparseAction_t +#define cublasDdot_v2 hipblasDdot +#define cublasDasum_v2 hipblasDasum +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasStpmv_v2 hipblasStpmv +#define cusparseIndexBase_t hipsparseIndexBase_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseOperation_t hipsparseOperation_t +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define cusparseCreateDnMat hipsparseCreateDnMat +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMM hipsparseSpMM +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseScsr2csc hipsparseScsr2csc +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_R_32F HIPBLAS_R_32F +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_OP_N HIPBLAS_OP_N +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define 
cudaMemset2DAsync hipMemset2DAsync // // HIPCUB // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 453d9d5fe62..b405d84a15b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,7 +5,7 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 \ +CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) From 07f2f36e398aa09a59a6655c212f8c1233f81216 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 18:36:28 -0500 Subject: [PATCH 05/22] Cudmatrix hipification complete. --- src/cudamatrix/cu-allocator.cc | 7 +++++ src/cudamatrix/cu-array.cc | 5 +++ src/cudamatrix/cu-block-matrix.cc | 6 ++++ src/cudamatrix/cu-common.cc | 5 +++ src/cudamatrix/cu-compressed-matrix.cc | 6 ++++ src/cudamatrix/cu-packed-matrix.cc | 6 ++++ src/cudamatrix/cu-sp-matrix.cc | 6 ++++ src/cudamatrix/cu-sparse-matrix.cc | 6 ++++ src/cudamatrix/cu-tp-matrix.cc | 6 ++++ src/cudamatrix/cu-vector.cc | 6 ++++ src/hip/hipify.h | 42 ++++++++++++++++++++++++++ 11 files changed, 101 insertions(+) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..8e08d3ef2a1 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #include +#endif + #include #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2017ebce5c7 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..a2bd910eba0 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..585d980ed19 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,12 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..0a5537b4248 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..f0563a6123f 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include 
+#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..a328457ca11 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..c0ebddfc95e 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..6929911fb5e 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..fa5d94fb0bc 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 697afc7a6d3..10010ceb70f 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -139,9 +139,51 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseDestroyMatDescr hipsparseDestroyMatDescr #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_C HIPBLAS_OP_C #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemset2DAsync hipMemset2DAsync +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define CUSPARSE_STATUS_NOT_SUPPORTED 
HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cuMemGetInfo_v2 hipMemGetInfo + // // HIPCUB // From fde6f7f478ce18af0142885fd625a33ce2946671 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 06:54:00 -0500 Subject: [PATCH 06/22] Ignore Eclipse synchronized project files. --- .gitignore | 4 ++++ src/chain/Makefile | 2 +- src/chain/chain-kernels-ansi.h | 4 ++++ src/chain/chain-kernels.cu | 5 +++++ src/makefiles/hip_64bit.mk | 8 +++----- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..53a4079d9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,7 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* + +# Eclipse sync project +.ptp-sync +.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index c4411f4b997..678bb03ef33 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -38,7 +38,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..739b9005854 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). 
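The guard added above is the heart of the scheme that lets the
unmodified .cu kernel sources build for AMD GPUs: the Makefile rule
compiles them with "hipcc -c -x hip", and __IS_HIP_COMPILE__ pulls in
the HIP runtime headers plus hipify.h, whose defines rename the CUDA
host API to its HIP counterparts during preprocessing. A minimal sketch
of the pattern (scale is an invented kernel, not one from the tree):

    // scale.cu -- one source for both nvcc and "hipcc -c -x hip"
    #ifdef __IS_HIP_COMPILE__
    #include <hip/hip_runtime.h>  // HIP runtime; hipify.h remaps the host API
    #endif

    __global__ void scale(float *x, float a, int n) {
      // identical index math in the CUDA and HIP kernel dialects
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= a;
    }
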
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index b405d84a15b..6ca4ea7d1b6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -13,9 +13,7 @@ ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64/stubs -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib/stubs -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -ROCM_LDFLAGS += - +#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib #CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt -ROCM_LDLIBS += +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 21ca60dfeeee2496801869ee96667cfd73df4aa6 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 08:02:20 -0500 Subject: [PATCH 07/22] Hipify complete including NVTX. --- src/chain/chain-kernels.cu | 1 - src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 84 ++++++++++++++------------ src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 7 ++- 16 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 739b9005854..2a30128750c 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -22,7 +22,6 @@ #ifdef __IS_HIP_COMPILE__ #include -#include #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 8e08d3ef2a1..82d682588d8 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index a3baa2fb33d..0cc1f7e6a4b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index a2bd910eba0..04885296445 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 585d980ed19..6275bc9073a 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -25,8 +25,10 @@ #ifdef __IS_HIP_COMPILE__ #include #include +#define API_NAME_PREFIX "HIP" #else #include +#define API_NAME_PREFIX "CU" #endif #include "base/kaldi-common.h" @@ -36,6 +38,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 
0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -48,9 +53,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else nvtxRangePop(); +#endif } #endif @@ -92,16 +102,16 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; } return "CUBLAS_STATUS_UNKNOWN_ERROR"; } @@ -110,43 +120,43 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case 
CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; + case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: return 
API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 617f4363269..a0c879414d4 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -36,7 +36,7 @@ #include #include #include -//TODO: tests with ROCTX #include +#include #include #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 0a5537b4248..de4fe6f8da2 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5bcb0552924..41f8d6f83d5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 515fa4d7d25..9286b6fe14a 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index a522f13451a..675ed74aeb4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index f0563a6123f..5acfc7443c4 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a328457ca11..adfb3e0b517 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index c0ebddfc95e..45742571a41 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 6929911fb5e..51fb744a855 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index fa5d94fb0bc..62ff16cb7f9 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 6ca4ea7d1b6..0ff628d67f6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,11 +5,14 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) 
endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ + +ROCM_USEROCTX = -DUSE_NVTX + +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) -ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ +ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread From 104023482690fbdc92d1cb190a85de8b697f86be Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:21:01 -0500 Subject: [PATCH 08/22] Format files for the hipification. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-array-inl.h | 2 +- src/cudamatrix/cu-array.cc | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 13 +- src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-kernels.cu | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 347 ++++++++++++------------- src/makefiles/hip_64bit.mk | 5 +- 19 files changed, 198 insertions(+), 199 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 82d682588d8..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,7 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0cc1f7e6a4b..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 36b829046ed..1fd80502cf9 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,7 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 2017ebce5c7..333e8fbed1c 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 04885296445..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 6275bc9073a..2e77062f20d 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #define API_NAME_PREFIX "HIP" #else 
#include @@ -112,8 +112,12 @@ const char* cublasGetStatusStringK(cublasStatus_t status) { case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { @@ -135,7 +139,7 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "SPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { @@ -155,6 +159,9 @@ const char* curandGetStatusString(curandStatus_t status) { case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index a0c879414d4..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index de4fe6f8da2..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 41f8d6f83d5..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 9286b6fe14a..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index c644cbc0784..9a99f19b58f 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,7 +30,7 @@ #include #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include #else #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 675ed74aeb4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,7 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 5acfc7443c4..c9d686d0ce8 100644 ---
a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index adfb3e0b517..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 45742571a41..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 51fb744a855..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 62ff16cb7f9..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 10010ceb70f..89daad6bc28 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,187 +5,180 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types -// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // - -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaGetDevice hipGetDevice -#define cudaGetErrorString hipGetErrorString -#define cudaStream_t hipStream_t -#define cudaStreamLegacy ((hipStream_t)1) -#define cudaStreamPerThread ((hipStream_t)2) -#define cublasStatus_t hipblasStatus_t -#define cudaError_t hipError_t -#define cusparseDestroy hipsparseDestroy -#define cudaGetLastError hipGetLastError - -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cublasCreate hipblasCreate -#define cublasSetStream hipblasSetStream -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define curandCreateGenerator hiprandCreateGenerator -#define curandSetStream hiprandSetStream -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaGetDeviceProperties hipGetDeviceProperties -#define curandDestroyGenerator hiprandDestroyGenerator -#define cusparseDestroy hipsparseDestroy -#define cudaDeviceProp hipDeviceProp_t -#define cublasOperation_t hipblasOperation_t -#define cublasStatus_t hipblasStatus_t -#define cusparseStatus_t hipsparseStatus_t -#define curandStatus_t hiprandStatus_t -#define cublasHandle_t hipblasHandle_t -#define cusparseHandle_t hipsparseHandle_t -#define curandGenerator_t hiprandGenerator_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cublasComputeType_t hipblasDatatype_t -#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStreamSynchronize hipStreamSynchronize -#define 
cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasDscal_v2 hipblasDscal -#define cublasSscal_v2 hipblasSscal -#define cudaSetDevice hipSetDevice -#define cudaSuccess hipSuccess -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnSetStream hipsolverDnSetStream -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define cusparseCreate hipsparseCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusparseSetStream hipsparseSetStream -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaDeviceReset hipDeviceReset -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cublasDestroy hipblasDestroy -#define cuDeviceGetName hipDeviceGetName -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cudaMemsetAsync hipMemsetAsync -#define cublasGemmEx hipblasGemmEx -#define cublasDgemm_v2 hipblasDgemm -#define cublasSger_v2 hipblasSger -#define cublasDger_v2 hipblasDger -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasSdot_v2 hipblasSdot -#define cublasSasum_v2 hipblasSasum -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasScopy_v2 hipblasScopy -#define cublasDcopy_v2 hipblasDcopy -#define cublasSgemv_v2 hipblasSgemv -#define cublasDgemv_v2 hipblasDgemv -#define cublasSspmv_v2 hipblasSspmv -#define cublasDspmv_v2 hipblasDspmv -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasSspr_v2 hipblasSspr -#define cublasDspr_v2 hipblasDspr -#define cudaDataType hipDataType -#define cusparseAction_t hipsparseAction_t -#define cublasDdot_v2 hipblasDdot -#define cublasDasum_v2 hipblasDasum -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasStpmv_v2 hipblasStpmv -#define cusparseIndexBase_t hipsparseIndexBase_t -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define cusparseOperation_t hipsparseOperation_t -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define 
CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define cusparseCreateDnMat hipsparseCreateDnMat -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMM hipsparseSpMM -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseScsr2csc hipsparseScsr2csc -#define CUDA_R_64F HIP_R_64F -#define CUDA_R_32F HIP_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_R_32F HIPBLAS_R_32F -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_C HIPBLAS_OP_C -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemset2DAsync hipMemset2DAsync -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define 
CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR #define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO -#define cudaMalloc hipMalloc -#define cudaMallocPitch hipMallocPitch -#define cuMemGetInfo_v2 hipMemGetInfo +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName 
+#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaSetDevice hipSetDevice +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble 
+#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetStream hiprandSetStream +#define curandStatus_t hiprandStatus_t +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cusolverDnSetStream hipsolverDnSetStream +#define cusparseAction_t hipsparseAction_t +#define cusparseCreate hipsparseCreate +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateDnMat hipsparseCreateDnMat +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define cusparseHandle_t hipsparseHandle_t +#define cusparseIndexBase_t hipsparseIndexBase_t +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cusparseOperation_t hipsparseOperation_t +#define cusparseScsr2csc hipsparseScsr2csc +#define cusparseSetStream hipsparseSetStream +#define cusparseSpMM hipsparseSpMM +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseStatus_t hipsparseStatus_t // -// HIPCUB +// HIPCUB namespace. // #define cub hipcub diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0ff628d67f6..0c558a770d6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,8 +5,8 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif - -ROCM_USEROCTX = -DUSE_NVTX +# Uncomment if willing to use ROCTX capabilities. +# ROCM_USEROCTX = -DUSE_NVTX CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) @@ -18,5 +18,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -#CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 801115d710904ca505e318e9cd9cc3ffa7fc0f87 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:57:45 -0500 Subject: [PATCH 09/22] Add hipification entries dropped by mistake. --- src/hip/hipify.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 89daad6bc28..7a0300ae02b 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -7,9 +7,12 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. 
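Note: the two TODO mappings just above flatten CUDA's TF32/FP16 "fast" compute types onto plain FP32 hipBLAS, and the tensor-op GEMM algorithm onto the default one. A compile-time sketch of what that substitution means (hypothetical check file, not part of the patch; needs hipcc and -I../hip since hipify.h contains device code, and the hipblas.h header path may vary across ROCm releases):

    // fallback_check.cc -- sketch; compile with hipcc -I../hip.
    #include <hipblas.h>
    #include "hipify.h"

    // After macro substitution both sides name the same hipBLAS enumerator,
    // i.e. a tensor-core request quietly degrades to the plain FP32 path.
    static_assert(CUBLAS_COMPUTE_32F_FAST_TF32 == HIPBLAS_R_32F,
                  "TF32 compute falls back to FP32 under ROCm");
    static_assert(CUBLAS_GEMM_DEFAULT_TENSOR_OP == HIPBLAS_GEMM_DEFAULT,
                  "tensor-op algo falls back to the default GEMM under ROCm");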
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. #define CUBLAS_OP_C HIPBLAS_OP_C #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_N HIPBLAS_OP_N @@ -146,6 +149,7 @@ #define curandGenerateUniformDouble hiprandGenerateUniformDouble #define curandGenerator_t hiprandGenerator_t #define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordering. #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t From 081de1ebcc44b846c4953bb3923818d6142b90cc Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 06:06:19 -0500 Subject: [PATCH 10/22] Change IS_GPU_ENABLED to IS_GPU_BUILD in depends build. --- src/makefiles/default_rules.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index c27b7b0a108..21a3b053639 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,7 +145,7 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(IS_GPU_ENABLED), true) +ifeq ($(IS_GPU_BUILD), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) From 00098bf097ca7e9e804562c937b20c6714adf2f8 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:11:35 -0500 Subject: [PATCH 11/22] Add build logic for ROCm < 5.2.0. --- src/configure | 28 +++++++++++++++++++++------- src/hip/hipify.h | 21 +++++++++++++++++++++ src/makefiles/hip_64bit.mk | 17 ++++++++++++----- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/configure b/src/configure index fa0b77373a0..ffb87abe106 100755 --- a/src/configure +++ b/src/configure @@ -259,7 +259,7 @@ function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do - if [ -f $base/bin/hipcc ]; then + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then ROCMDIR=$base break fi @@ -268,7 +268,7 @@ function configure_rocm { if [ -d "$ROCMDIR" ]; then if [ ! -f $ROCMDIR/bin/hipcc ]; then - failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + failure "Cannot find hipcc and hipconfig in ROCm directory $ROCMDIR" fi fi echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" @@ -289,7 +289,20 @@ function configure_rocm { echo "HOST_ARCH = `uname -m`" >> kaldi.mk echo >> kaldi.mk - + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards uses + # __HIP_PLATFORM_AMD__; older versions use __HIP_PLATFORM_HCC__. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk + else + echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + fi + # 64bit/32bit? Not Linux?
We do not support cross compilation with ROCm so, # use direct calls to uname -m here if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then @@ -300,10 +313,11 @@ WARNING: ROCM will not be used! ROCM is only supported with 64-bit Linux builds." exit 1; fi - - #add cusolver flags for newer toolkits - if [ "$CUSOLVER" == "true" ]; then - echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + else + echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk fi } diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 7a0300ae02b..bdefa9cc4dd 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -3,6 +3,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +#undef hipLaunchKernelGGLInternal +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#else +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#endif + // // HIP types // @@ -153,10 +167,17 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t +#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#else +#define cusolverDnCreate hipsolverCreate +#define cusolverDnDestroy hipsolverDestroy +#define cusolverDnHandle_t hipsolverHandle_t +#define cusolverDnSetStream hipsolverSetStream +#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0c558a770d6..3976624032d 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -8,13 +8,20 @@ endif # Uncomment if willing to use ROCTX capabilities. # ROCM_USEROCTX = -DUSE_NVTX -CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) +# Specific HIP/ROCm components should be included prior to the generic include to avoid +# deprecation warnings. +CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -DCUDA_VERSION=11000 \ + -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. 
-I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ - -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib From 9b8dffb3a594293fbf4286233df610ae6041b284 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:33:16 -0500 Subject: [PATCH 12/22] Complete ROCm 5.0.2 build with no per-thread streams yet. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +++--- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..3b47ee525eb 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..09ba2c9aa13 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..309d68fccf7 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..99165cc592f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..dfcbf41d131 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..c073ab358ea 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h 
b/src/cudamatrix/cu-device.h index d7edf5a5a1c..1311668ec33 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..96c1ef14ed4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..8a5865f71af 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..fabd06c9b16 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..3853ffa7e45 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..dd3a333c9a5 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..cc6332ba48c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 3976624032d..160f5fb5c0f 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -10,7 +10,7 @@ endif # Specific HIP/ROCm components should be included prior to the generic include to avoid # deprecation warnings. -CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ From e84d8f072496c9427e804f8189854da9ff49c04b Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 07:44:43 -0500 Subject: [PATCH 13/22] Add cudadecoder support for ROCm 5.2.x. 
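Note on the stream model in this patch: on ROCm >= 5.2 the build relies on -fgpu-default-stream=per-thread, while for older ROCm the Makefiles below gain a sed-based %.hip rule that rewrites every triple-chevron launch to name hipStreamPerThread explicitly. The intended end state for a launch, as a sketch (hypothetical kernel, not taken from the patch):

    #include <hip/hip_runtime.h>

    __global__ void scale_kernel(float *x, float a, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= a;
    }

    void Scale(float *x, float a, int n) {
      int blocks = (n + 255) / 256;
      // What the sed rewrite produces from `scale_kernel<<<blocks, 256>>>(...)`:
      // a zero dynamic-shared-memory size plus the explicit per-thread stream.
      scale_kernel<<<blocks, 256, 0, hipStreamPerThread>>>(x, a, n);
    }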
--- src/chain/Makefile | 16 ++++-- src/configure | 9 +++- src/cudadecoder/Makefile | 22 +++++++- .../batched-static-nnet3-kernels.cu | 5 ++ .../batched-static-nnet3-kernels.h | 5 ++ ...hed-threaded-nnet3-cuda-online-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline2.cc | 5 ++ src/cudadecoder/cuda-decoder-kernels-utils.h | 4 +- src/cudadecoder/cuda-decoder-kernels.cu | 6 +++ src/cudadecoder/cuda-decoder.cc | 24 +++++---- src/cudadecoder/cuda-decoder.h | 5 ++ src/cudadecoder/cuda-fst.cc | 6 +++ src/cudadecoderbin/Makefile | 4 +- .../batched-wav-nnet3-cuda-online.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 7 +++ src/cudafeat/Makefile | 23 +++++++- ...eature-online-batched-cmvn-cuda-kernels.cu | 5 ++ ...ure-online-batched-ivector-cuda-kernels.cu | 5 ++ .../feature-online-batched-ivector-cuda.cc | 16 ++++++ ...re-online-batched-spectral-cuda-kernels.cu | 6 +++ .../feature-online-batched-spectral-cuda.h | 5 ++ src/cudafeat/feature-online-cmvn-cuda.cu | 8 +++ src/cudafeat/feature-spectral-cuda.cu | 6 +++ src/cudafeat/feature-spectral-cuda.h | 5 ++ src/cudafeat/feature-window-cuda.cu | 5 ++ .../online-batched-feature-pipeline-cuda.cc | 7 ++- .../online-batched-feature-pipeline-cuda.h | 4 ++ .../online-ivector-feature-cuda-kernels.cu | 6 +++ src/cudafeat/online-ivector-feature-cuda.cc | 14 ++++- src/cudamatrix/Makefile | 16 ++++-- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +-- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 +-- src/cudamatrix/cu-kernels.cu | 1 + src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 54 ++++++++++++++----- src/makefiles/hip_64bit.mk | 18 +++++-- 48 files changed, 318 insertions(+), 62 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 678bb03ef33..5cc8d8901a1 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,13 +33,21 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ffb87abe106..ca3df9563ab 100755 --- a/src/configure +++ b/src/configure @@ -316,8 +316,9 @@ WARNING: ROCM will not be used! 
if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk else - echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk fi } @@ -1055,7 +1056,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..062e9a47d41 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..0bcb1997576 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,12 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..c7012b686e0 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,12 @@ #include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc 
index 89e93e5d98c..d5cf7dae2d7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..f6a3455db01 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..6a14371911d 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
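Note: the two hunks in cuda-decoder-kernels-utils.h above replace whole-union assignment (assumed = old) with an explicit copy of the 64-bit member, the form hipcc compiles cleanly inside these atomicCAS retry loops. For reference, the fixed min-update pattern in isolation (a self-contained restatement of the code above, not new functionality):

    #include <hip/hip_runtime.h>

    union UInt64UnionInt2 {
      int2 i2;
      unsigned long long ull;
    };

    __device__ inline void atomicMinI2(int2 *ptr, int2 val) {
      unsigned long long *ptr64 = reinterpret_cast<unsigned long long *>(ptr);
      UInt64UnionInt2 old, assumed, value;
      old.ull = *ptr64;
      value.i2 = val;
      if (old.i2.x <= val.x) return;
      do {
        assumed.ull = old.ull;  // copy through the 64-bit view, not `assumed = old`
        old.ull = atomicCAS(ptr64, assumed.ull, value.ull);
      } while (old.ull != assumed.ull && old.i2.x > value.i2.x);
    }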
+#ifdef __IS_HIP_COMPILE__ +#include "float.h" +#include +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..06dceae73a5 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,14 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +190,35 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void**)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void**)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void**)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void**)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..510904aa004 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,12 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..3af37eb7676 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,14 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- 
a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..56368853df2 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. #endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..05af50d7a3b 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..c14571f2ed9 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,16 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include "hipify.h" +#else #include #include #include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..c3a4489e18e 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..09b0caff255 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 
+15,12 @@ // See the License for the specific language governing permissions and // limitations under the License. // +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..0b4cfce812c 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,12 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..6d68c93f917 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..f847311d755 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..8d4648d04bb 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
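Note: the feature-online-batched-ivector-cuda.cc prologue above re-points CUBLAS_FILL_MODE_LOWER (and CUDA_R_32F) at hipSOLVER/hipBLAS enumerators because the batched solver entry points it maps (cusolverDnSpotrfBatched/cusolverDnSpotrsBatched onto hipsolverDn*) take different enum types than the hipify.h defaults. A hedged sketch of the resulting factorization call (hypothetical wrapper and buffers; the signature is mirrored from the cuSOLVER batched API, and the header path varies across ROCm releases):

    #include <hipsolver/hipsolver.h>

    // Batched Cholesky (LL^T) of n-by-n SPD matrices, lower triangle stored.
    hipsolverStatus_t CholeskyFactorBatched(hipsolverDnHandle_t handle, int n,
                                            float *Aarray[], int lda,
                                            int *infoArray, int batchSize) {
      return hipsolverDnSpotrfBatched(handle, HIPSOLVER_FILL_MODE_LOWER, n,
                                      Aarray, lda, infoArray, batchSize);
    }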
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +39,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..c320c85a029 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-spectral-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..6ba45e682c1 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,12 @@ #include "cudafeat/feature-window-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..650b51ec3c7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,12 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { @@ -95,7 +100,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..378ea18e689 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..c3b15d72a5b 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,19 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else #include #endif +#endif + #include #include "base/io-funcs.h" @@ -288,13 +299,14 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. CuMatrix A(quadratic); + + #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); - // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 512028c6c13..5cd4adcffd8 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,12 +34,20 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 3b47ee525eb..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 09ba2c9aa13..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 309d68fccf7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 99165cc592f..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc 
b/src/cudamatrix/cu-compressed-matrix.cc index dfcbf41d131..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c073ab358ea..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 1311668ec33..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9a99f19b58f..1d6e0664541 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -23,6 +23,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. + // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 96c1ef14ed4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 8a5865f71af..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index fabd06c9b16..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 3853ffa7e45..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index dd3a333c9a5..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cc6332ba48c..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index bdefa9cc4dd..24b5f2f8eb3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -1,29 +1,22 @@ #ifndef __HIPIFY_H__ #define __HIPIFY_H__ 
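+// hipify.h maps the CUDA API names used throughout the .cu sources onto
+// their HIP counterparts with plain #defines, so those files can be
+// compiled unchanged by HIPCC.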
+#ifdef __HIPCC__ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} - - -#undef hipLaunchKernelGGLInternal -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +// AMDGCN only supports this rounding mode. +#define __fdiv_rd __fdiv_rn #else -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +#define __align__(x) __attribute__((aligned (x))) #endif // // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. @@ -46,6 +39,8 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUDA_R_32F HIP_R_32F #define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS #define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT #define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED #define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH @@ -104,6 +99,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cublasGemmAlgo_t hipblasGemmAlgo_t #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasOperation_t hipblasOperation_t #define cublasSasum_v2 hipblasSasum @@ -133,15 +129,29 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse #define cudaErrorInvalidDevice hipErrorInvalidDevice #define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName #define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define 
cudaHostUnregister hipHostUnregister #define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice @@ -150,11 +160,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync #define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream #define curandCreateGenerator hiprandCreateGenerator #define curandDestroyGenerator hiprandDestroyGenerator #define curandGenerateNormal hiprandGenerateNormal @@ -178,6 +197,11 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream #endif +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -201,7 +225,9 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize #define cusparseSpMatDescr_t hipsparseSpMatDescr_t #define cusparseStatus_t hipsparseStatus_t - +#define nvtxRangePop roctxRangePop +#define nvtxRangePush roctxRangePush +#define nvtxRangePushA roctxRangePushA // // HIPCUB namespace. // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 160f5fb5c0f..e2f43ecd55c 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -14,9 +14,21 @@ CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) + -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. 
-I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ @@ -25,4 +37,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 From aed0ce594e72bc935ab1f2fade0f26aa5229a3b9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 11:44:33 -0500 Subject: [PATCH 14/22] Complete support for ROCm 5.0.2. --- src/chain/Makefile | 2 +- src/cudadecoder/Makefile | 2 +- src/cudafeat/Makefile | 2 +- .../feature-online-batched-ivector-cuda.cc | 41 +++++++++++++++++-- .../feature-online-batched-spectral-cuda.h | 4 ++ src/cudafeat/feature-spectral-cuda.h | 4 ++ src/cudafeat/online-ivector-feature-cuda.cc | 17 ++++++++ src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-allocator.cc | 4 ++ src/cudamatrix/cu-allocator.h | 4 ++ src/cudamatrix/cu-block-matrix.cc | 4 ++ src/cudamatrix/cu-common.h | 7 +++- src/cudamatrix/cu-compressed-matrix.cc | 4 ++ src/cudamatrix/cu-device.cc | 5 ++- src/cudamatrix/cu-device.h | 11 ++++- src/cudamatrix/cu-matrix.cc | 4 ++ src/cudamatrix/cu-packed-matrix.cc | 4 ++ src/cudamatrix/cu-sp-matrix.cc | 4 ++ src/cudamatrix/cu-sparse-matrix.cc | 4 ++ src/cudamatrix/cu-tp-matrix.cc | 4 ++ src/cudamatrix/cu-vector.cc | 4 ++ src/hip/hipify.h | 16 +++++--- 22 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 5cc8d8901a1..5b177981ad8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -36,7 +36,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index 062e9a47d41..d4eda345564 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -44,7 +44,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c3a4489e18e..c0f54a854e8 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -47,7 +47,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 6d68c93f917..68c247b43e9 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -17,9 +17,6 @@ #ifdef __IS_HIP_COMPILE__ #include "hipify.h" -#define 
cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched // The BLAS enumerators are used instead of the SOLVER ones. #ifdef CUBLAS_FILL_MODE_LOWER #undef CUBLAS_FILL_MODE_LOWER @@ -385,6 +382,43 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; + +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work; + + // query temp buffer size for the batched factorization + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, &L_work, num_lanes)); + // allocate temp buffer + float *workspace = static_cast( + CuDevice::Instantiate().Malloc(L_work * sizeof(float))); + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); + + int L_work2; + + // query temp buffer size for the batched solve + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + // solve for rhs in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, + num_lanes)); + + CuDevice::Instantiate().Free(workspace); + CuDevice::Instantiate().Free(workspace2); +#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -395,6 +429,7 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); +#endif #endif // cusolver solves in place. 
Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..202232c6b23 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..66f0dce395a 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index c3b15d72a5b..56dbac93165 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -317,9 +317,26 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work2; + CUSOLVER_SAFE_CALL( + hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + CUSOLVER_SAFE_CALL(hipsolverSpotrs( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); + + CuDevice::Instantiate().Free(workspace2); +#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); +#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 5cd4adcffd8..3c1100753e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -37,7 +37,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..d81dca002ce 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,11 @@ #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..f776bbb620e 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..7983cd250e7 100644 --- 
a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..c4bdf569d3c 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,15 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include -#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..442d2dbac67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..3dada172ba8 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,10 +21,13 @@ // limitations under the License. - #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index d7edf5a5a1c..67b9f1d9e9b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,16 @@ #include #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include #include -#include #include "hipify.h" #else #include @@ -44,7 +49,11 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..9897917a33f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..4de0fcba63d 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..86a3cd9a726 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..93d10099466 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 
5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..739bab3dd59 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..1deb1cb8733 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 24b5f2f8eb3..b631ac08a23 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -191,17 +191,22 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #else #define cusolverDnCreate hipsolverCreate #define cusolverDnDestroy hipsolverDestroy #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream +#define cusolverDnSpotrf hipsolverSpotrf +#define cusolverDnSpotrfBatched hipsolverSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverSpotrs +#define cusolverDnSpotrsBatched hipsolverSpotrsBatched #endif -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -235,3 +240,4 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #endif //__HIPIFY_H__ + From 6d8dd4c2337f224bb7f230cbb41d5e5311c75632 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 20 Oct 2023 11:48:09 +0000 Subject: [PATCH 15/22] Fix __CUDA_ARCH__ issue and add more hipification. 
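
HIPCC leaves __CUDA_ARCH__ undefined, so arch guards in the kernels, such as
the '#if __CUDA_ARCH__ == 750' launch-bounds specialization in
feature-online-cmvn-cuda.cu, evaluate it as 0 and resolve as if compiling for
an unknown device. This change moves the -D__CUDA_ARCH__=800 definition out of
ROCM_FLAGS and into the individual .cu files, just before the HIP headers,
since (per the TODO added to hip_64bit.mk) defining it globally makes HIPCC
assume CUDA is active and that everything is a device compile. A sketch of the
per-file pattern, with illustrative header names since the exact includes vary
per file:

    #ifdef __IS_HIP_COMPILE__
    #define __CUDA_ARCH__ 800  // sm_80-class; "mostly supported by ROCm"
    #include "hip/hip_runtime.h"
    #include "hipify.h"
    #endif

    #if __CUDA_ARCH__ == 750  // now compares against a concrete value
    __launch_bounds__ (1024, 1)
    #endif
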
--- src/chain/chain-kernels.cu | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeatbin/Makefile | 8 +++++--- src/cudafeatbin/apply-batched-cmvn-online-cuda.cc | 2 ++ .../compute-fbank-online-batched-cuda.cc | 2 ++ .../compute-mfcc-online-batched-cuda.cc | 2 ++ .../compute-online-feats-batched-cuda.cc | 2 ++ src/cudafeatbin/compute-online-feats-cuda.cc | 2 ++ src/cudamatrix/cu-kernels.cu | 1 + src/hip/hipify.h | 5 +++++ src/makefiles/hip_64bit.mk | 15 +++++++++++---- 11 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 2a30128750c..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -21,6 +21,7 @@ #include "chain/chain-kernels-ansi.h" #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #endif diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 8d4648d04bb..1c896f1307f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -16,6 +16,7 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #else diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..2cd6bbb6a93 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,9 +16,11 @@ // limitations under the License. 
#if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include #include #endif +#endif #include #include 
diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..70380f8ccad 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudafeat/online-cuda-feature-pipeline.h" 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1d6e0664541..1b0cf1f2c90 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,6 +30,7 @@ #include #include #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" 
diff --git a/src/hip/hipify.h b/src/hip/hipify.h index b631ac08a23..723b5b1f059 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -148,6 +148,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaHostRegister hipHostRegister #define cudaHostRegisterDefault hipHostRegisterDefault #define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch @@ -157,12 +158,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop #define cudaSetDevice hipSetDevice #define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent 
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index e2f43ecd55c..8d85872aa9b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -29,12 +29,19 @@ ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ -I$(ROCMDIR)/rocrand/include \ -I$(ROCMDIR)/include \ -I.. -I../hip -isystem $(OPENFSTINC) + +# TODO: Consider passing __CUDA_ARCH__=800 here as it is mostly supported by ROCm. +# However this macro has side effects with HIPCC that make it assume +# CUDA is active and that everything is a device compile. ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ - -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 + -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics -#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +# TODO: Consider using ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. 
+# We allow the libraries we link against to have undefined symbols so that this +# can be built on systems with no development version of these libraries (e.g. ncurses). CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined +LDLIBS += -Wl,--allow-shlib-undefined 
From f584420d8c1448e8e70f9106aa49712f63d06347 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 6 Nov 2023 16:57:15 +0000 Subject: [PATCH 16/22] Fix tests with zero-size matrices and add syncwarp where needed for LDS sharing. --- ...ure-online-batched-ivector-cuda-kernels.cu | 30 +++-- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 4 +- .../online-ivector-feature-cuda-kernels.cu | 26 ++-- src/cudamatrix/cu-device.cc | 4 + src/cudamatrix/cu-kernels.cu | 127 ++++++++++++++---- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-math.cc | 2 +- src/cudamatrix/cu-matrix-test.cc | 24 +++- src/cudamatrix/cu-matrix.cc | 12 +- src/cudamatrix/cu-sparse-matrix.cc | 6 +- src/cudamatrix/cu-vector.cc | 13 +- src/hip/hipify.h | 35 ++++- src/makefiles/hip_64bit.mk | 7 +- 15 files changed, 219 insertions(+), 90 deletions(-) 
diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b4cfce812c..e5b89d163e5 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -50,7 +50,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -101,8 +101,10 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -215,8 +217,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -311,10 +313,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less 
than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() + KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, feat_dim, ldf, @@ -324,8 +326,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, { int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -507,8 +509,8 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -584,7 +586,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -629,7 +631,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( 
diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index f847311d755..27375f4914e 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -68,7 +68,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -487,7 +487,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, 
diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 1c896f1307f..f8947a3b5ed 100644 --- 
a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -188,8 +188,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; 
diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index c320c85a029..9c0d5df5288 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -134,7 +134,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -493,7 +493,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, 
diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 378ea18e689..dffc9fd3c8f 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -26,17 +26,17 @@ #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +47,13 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,9 +139,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize 
WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -207,7 +207,7 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +215,8 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -238,7 +238,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -249,7 +249,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); 
diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 3dada172ba8..25775fb1b05 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -249,8 +249,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode + #ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; + #else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; + #endif // We want to choose the device more carefully, so release the CUDA context. 
e = cudaDeviceReset(); if (e != cudaSuccess) { 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1b0cf1f2c90..792932c18d5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -966,6 +966,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1118,8 +1119,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1176,6 +1177,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1219,6 +1221,7 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1270,6 +1273,7 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1353,6 +1357,7 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1805,6 +1810,7 @@ static void _vec_transform_reduce( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } } @@ -1904,7 +1910,6 @@ __global__ void _strided_reduction_fused_kernel(Real * __restrict__ dots, const int idx = colStart + (j + u*stride) * d.stride; vals[u] = op.Transform(data[idx]); } - #pragma unroll for (int u = 0; u < unroll_count; ++u) { thread_data = op.Reduce(thread_data, vals[u]); @@ -2018,6 +2023,7 @@ static void _transform_reduce_mat_rows( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } // Output to vector result. @@ -2053,12 +2059,14 @@ static void _transform_reduce_mat_cols( __syncthreads(); } // Reduce last warp. Threads implicitly synchronized within a warp. if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) - sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + for (int shift = warpSize; shift > 0; shift >>= 1) { + sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); + } } // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2117,6 +2159,7 @@ static void _group_transform_reduce( # pragma unroll for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); + __syncwarp(); } } @@ -2981,6 +3024,7 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, for (int shift = warpSize; shift > 0; shift >>= 1) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; + __syncwarp(); } } @@ -3271,6 +3315,7 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } + __syncwarp(0xffffffffu >> (32-num_working_threads)); } } @@ -3999,7 +4044,7 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) // grid dim x,y expands to fill out average in x/y across batches // grid dim.z is batch template @@ -4380,7 +4425,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4401,6 +4446,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -4409,9 +4459,13 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, const float* N, const MatrixDim dim_N, const float beta, float* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -4451,6 +4505,7 @@ void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim, transpose, elements); } + void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -5086,7 +5141,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, 
@@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha,
   } else if (Bl.x == 32) {
     _add_diag_mat_mat_MTN<32> <<<Gr,Bl>>>(alpha, M, stride_M, N, dim_N, beta,
                                           v, stride_v);
+#ifdef __IS_HIP_COMPILE__
+  } else if (Bl.x == 64) {
+    _add_diag_mat_mat_MTN<64> <<<Gr,Bl>>>(alpha, M, stride_M, N, dim_N, beta,
+                                          v, stride_v);
+#endif
   }
 }
 
@@ -5115,9 +5175,13 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha,
                                const double* N, const MatrixDim dim_N,
                                const double beta, double* v) {
   if (Bl.x == 16) {
-    _add_diag_mat_mat_MN<16> <<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+    _add_diag_mat_mat_MN<16><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
   } else if (Bl.x==32) {
     _add_diag_mat_mat_MN<32><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+#ifdef __IS_HIP_COMPILE__
+  } else if (Bl.x==64) {
+    _add_diag_mat_mat_MN<64><<<Gr,Bl>>>(alpha,M,stride_M,N,dim_N,beta,v);
+#endif
   }
 }
 
@@ -5488,25 +5552,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out,
 void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out,
                                  const float* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out,
                                  const float* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out,
                                  const double* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out,
                                  const double* mat_in, MatrixDim d_out,
                                  MatrixDim d_in) {
-  _copy_from_mat_trans<32> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
+  _copy_from_mat_trans<GPU_WARP_SIZE> <<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
 void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim,
@@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
 // this will synchronize all threads without blocking.
 void cuda_legacy_noop() {
+#ifdef __IS_HIP_COMPILE__
+  // HIP doesn't currently support the cudaStreamLegacy stream, so we force
+  // use of the non-per-thread API to get similar semantics.
+  auto k = reinterpret_cast<void *>(_noop_kernel);
+  hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0);
+#else
   _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>();
+#endif
 }
 
 void cudaF_mat_copy_range_clamped(
@@ -5812,8 +5883,8 @@
     float *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end,
       num_cols, src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5826,8 +5897,8 @@ void cudaD_mat_copy_range_clamped(
     double *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end,
       num_cols, src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5837,7 +5908,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const float **inputs,
                              int32_t *ldi, float **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc<float> batch_desc;
 
@@ -5863,8 +5934,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here. Memory will be passed down directly
@@ -5886,8 +5957,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here. Memory will be passed down directly
@@ -5902,7 +5973,7 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const double **inputs,
                              int32_t *ldi, double **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc<double> batch_desc;
 
@@ -5928,8 +5999,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.
Memory will be passed down directly @@ -5951,8 +6022,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, remaining); // no memcpy needed here. Memory will be passed down directly diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index e1d59e777be..1245fb28bad 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), - cell_dim = RandInt(2, 200), - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); + int32 num_rows = RandInt(5, 20), //16 + cell_dim = RandInt(2, 200), //45 + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); //3 // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,7 +232,6 @@ void UnitTestLstmNonlinearity() { else test_params = -1; - CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -277,11 +276,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { + if (test_input >= 0) { // -1 delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { + if (test_params >= 0) { // 0 delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 3fbeff3a470..d0d8e4e771f 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), // n_blocks(num_rows, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index be8483e48f5..26a5281ec05 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,10 +2675,18 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { + + // if (CuDevice::Instantiate().Enabled()) { + // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); + // } + for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; + MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); + // M.SetZero(); + // M.Add(0.5); + // M.SetZeroAboveDiag(); M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
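The hunk that follows continues UnitTestCuMatrixSetRandUniform, which centers U(0,1) samples with M.Add(-0.5) and then checks the central moments of the result. For the uniform distribution on [-1/2, 1/2] the odd central moments are zero and the even ones are (1/2)^p / (p+1), which is what the test's expected values encode; a small self-contained helper (ours, for illustration only) would be:

    #include <cmath>

    // Expected p-th central moment of U(-0.5, 0.5):
    // zero for odd p, (0.5^p) / (p + 1) for even p.
    double uniform_central_moment(int p) {
      return (p % 2 == 1) ? 0.0 : std::pow(0.5, p) / (p + 1);
    }

For p = 2 this gives 1/12, the familiar variance of a unit-width uniform distribution.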
@@ -2693,6 +2701,16 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); + + // if (CuDevice::Instantiate().Enabled()) { + // CuVector col_sum(rows, kUndefined); + // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); + // KALDI_LOG << "Sums vector is " << col_sum; + // Real ans = col_sum.Sum(); + // KALDI_LOG << "Total sum is " << ans; + // KALDI_ERR << "Stopping!"; + // } + Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2705,10 +2723,12 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; + //KALDI_LOG << "Random vector sum is " << col_sum; KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } + KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; } } } @@ -3061,7 +3081,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = true; + bool test_threads = false; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 9897917a33f..56acf340823 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -253,7 +253,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, } else { // 2D thread block with warps (blockDim.x) along the row-dim of input M. // Each (8x32) thread block will transpose (32x32) data - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(M.NumCols(), warpSize), n_blocks(M.NumRows(), warpSize)); @@ -859,7 +859,7 @@ void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), n_blocks(NumRows(), dimBlock.y)); @@ -1009,7 +1009,7 @@ void CuMatrixBase::AddSmat(Real alpha, const CuSparseMatrix &A, // We use warpSize threads per row to access only the nonzero elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of A. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y)); @@ -2186,7 +2186,7 @@ Real TraceMatMat(const CuMatrixBase &A, // if the matrix is not in a very bad shape. // (wider or taller than 32x8192) // CPU will then reduce to 1 element. 
- const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), n_blocks(A.NumRows(), kWarpSize)); @@ -2408,7 +2408,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { // and use transposed copy to fill *this // see CuMatrixBase::CopyFromMat() for more detail of the impl MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ }; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(rv_dim.cols, warpSize), n_blocks(rv_dim.rows, warpSize)); @@ -2418,7 +2418,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } else if (rv.Dim() == num_rows_) { // use 2D block (8x32) and large enough grid to cover matrix *this // dimBlock.x need to be at least warpSize for coalesced memory access. - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), n_blocks(num_rows_, dimBlock.y)); diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 93d10099466..1a82ce0d4df 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -148,7 +148,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); @@ -558,7 +558,7 @@ Real TraceMatSmat(const CuMatrixBase &A, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); @@ -648,7 +648,7 @@ void CuSparseMatrix::CopyToMat(CuMatrixBase *M, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 1deb1cb8733..f6426297e49 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -639,7 +639,10 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) - // 16x16 or 8x32 2D block for coalesced memory access. + // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2 + // or + // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE + // 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. 
for small matrices, use 1D grid with only 1 row of 16x16 block,
 //     to avoid multiple kernel launch;
@@ -647,11 +650,11 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     //    use 1- or 2-D grid so that the grid contains
     //    at least and not much larger than 'kOptNumBlocks' blocks
     //    to fully utilize the GPU;
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     const int32 kOptNumBlocks = 512;
     const int32 tile_dim =
         (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
-        16 : 32;
+        GPU_WARP_SIZE/2 : GPU_WARP_SIZE;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
                  n_blocks(N.NumRows(), dimBlock.y));
@@ -678,7 +681,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // One block per 'tile_dim' columns of N.
     // 1D grid expands along the row of N.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
@@ -687,7 +690,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // Case 4: diag(M'*N') == sum(N'.*M, 1)
     // Same kernel and config as case 3 except M and N are swapped.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index 723b5b1f059..56d7e869a32 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -2,7 +2,19 @@
 #ifdef __HIPCC__
-inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
+inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
+  // On CDNA hardware wave-fronts (warps) always execute in
+  // lock step. It might still be important, though, to signal
+  // that the compiler can't reorder code around certain code
+  // sections that rely on data sharing mechanisms like LDS
+  // (shared memory). So this implements a no-op that is seen
+  // by the compiler as having side effects.
+  __asm__("s_nop 0");
+
+  // A safer, though arguably less performant, option would be to use:
+  //   __asm__("s_waitcnt lgkmcnt(0)");
+  // to explicitly do a memory fence.
+}
 // AMDGCN only supports this rounding mode.
 #define __fdiv_rd __fdiv_rn
 #else
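The no-op-with-side-effects trick above relies on GCC-style inline assembly: an asm statement without outputs is implicitly volatile, so the compiler must keep it in place, which stops it from deleting or merging the surrounding reduction steps; the wavefront itself already runs in lock step on CDNA, so no hardware barrier instruction is required. A stricter variant (ours, not the patch's) adds an explicit memory clobber so that shared-memory accesses cannot be reordered across the fence either:

    // Compiler-only warp fence for lock-step wavefronts (sketch).
    // The "memory" clobber forces the compiler to treat all memory as
    // potentially touched, so LDS reads/writes are not moved across it.
    __device__ inline void wave_compiler_fence() {
      __asm__ __volatile__("s_nop 0" ::: "memory");
    }

Whether the clobber-free form in the patch is sufficient depends on how conservatively the compiler treats implicitly volatile asm; the s_waitcnt alternative mentioned in the comment is the belt-and-braces choice.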
@@ -153,7 +165,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaMallocHost hipHostMalloc
 #define cudaMallocPitch hipMallocPitch
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
+// hipMemcpy2DAsync behaves differently from its CUDA counterpart for
+// zero-sized copies; the disparity should be resolved by ROCm 5.7.1+. Then
+// the following would be sufficient:
+//   #define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f)       \
+  [&]() -> hipError_t {                                   \
+    if (width && height)                                  \
+      return hipMemcpy2DAsync(a,b,c,d,width,height,e,f);  \
+    return hipSuccess;                                    \
+  }()
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -166,8 +187,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaStreamCreate hipStreamCreate
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamLegacy ((hipStream_t)1)
-#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread ((hipStream_t)2)
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent hipStreamWaitEvent
@@ -243,6 +263,13 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 
 // #define cub hipcub
 
+//
+// Callback qualifier
+//
+#define CUDART_CB
+#define GPU_WARP_SIZE 64
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
 
 #endif //__HIPIFY_H__
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk
index 8d85872aa9b..aec3e359f53 100644
--- a/src/makefiles/hip_64bit.mk
+++ b/src/makefiles/hip_64bit.mk
@@ -37,11 +37,14 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \
              -D__IS_HIP_COMPILE__=1 \
              -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \
              -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \
-             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics
+             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics \
+             $(EXTRA_ROCM_FLAGS)
+
 # TODO: Consider using ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles.
 # We allow the libraries we link against to have undefined symbols so that this can be built on
 # systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
 CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
-LDLIBS += -Wl,--allow-shlib-undefined
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

From ba4e18fcb2987b7172057aa5fc2613a9e1c1f2f8 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 17:14:29 +0000
Subject: [PATCH 17/22] Move misplaced #pragma unroll.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ac532790b86..349b21b6591 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2135,8 +2135,8 @@ static void _group_transform_reduce(
   __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
-# pragma unroll
   int shift = threads_per_group / 2;
+# pragma unroll
   for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);

From dac0b272cfff3fba9be4b3cfdd2767271e0d4760 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 23:46:48 +0000
Subject: [PATCH 18/22] Working version trimmed of legacy ROCm < 5.2 code.
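The GPU_WARP_SIZE / GPU_MAX_WARPS_PER_BLOCK pair defined in hipify.h above is what makes the launch-geometry rewrites earlier in the series warp-size-agnostic: a block stays at GPU_MAX_THREADS_PER_BLOCK threads whether a warp is 32 lanes (CUDA) or 64 (CDNA). A sketch of the resulting sizing idiom, assuming the CUDA build defines the same macros as 32/1024 (the helper names are illustrative, not Kaldi's):

    // Ceil-division used throughout the series to size grids.
    __host__ __device__ constexpr int ceil_div(int n, int d) {
      return (n + d - 1) / d;
    }

    // Block: one warp wide, as many warps tall as fit in one block.
    // Grid: enough blocks to tile a rows x cols matrix.
    dim3 make_threads() {
      return dim3(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
    }
    dim3 make_blocks(int num_rows, int num_cols) {
      return dim3(ceil_div(num_cols, GPU_WARP_SIZE),
                  ceil_div(num_rows, GPU_MAX_WARPS_PER_BLOCK));
    }

On CUDA this reproduces the original 32x32 blocks and (n+31)/32 grids; on HIP it yields 64x16 blocks, keeping the per-block thread count at 1024.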
--- .gitignore | 4 - src/chain/Makefile | 12 -- src/configure | 22 +-- src/cudadecoder/Makefile | 12 -- src/cudadecoder/cuda-decoder.cc | 2 +- src/cudafeat/Makefile | 12 -- .../feature-online-batched-ivector-cuda.cc | 38 ----- .../feature-online-batched-spectral-cuda.h | 4 - src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeat/feature-spectral-cuda.h | 4 - src/cudafeat/online-ivector-feature-cuda.cc | 20 +-- src/cudamatrix/Makefile | 12 -- src/cudamatrix/cu-allocator.cc | 4 - src/cudamatrix/cu-allocator.h | 4 - src/cudamatrix/cu-block-matrix.cc | 4 - src/cudamatrix/cu-common.h | 5 - src/cudamatrix/cu-compressed-matrix.cc | 4 - src/cudamatrix/cu-device.cc | 5 +- src/cudamatrix/cu-device.h | 9 -- src/cudamatrix/cu-kernels.cu | 33 +--- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-matrix-test.cc | 30 +--- src/cudamatrix/cu-matrix.cc | 4 - src/cudamatrix/cu-packed-matrix.cc | 4 - src/cudamatrix/cu-sp-matrix.cc | 4 - src/cudamatrix/cu-sparse-matrix.cc | 4 - src/cudamatrix/cu-tp-matrix.cc | 4 - src/cudamatrix/cu-vector.cc | 4 - src/hip/hipify.h | 12 -- src/hip/math_constants.h | 152 ------------------ src/makefiles/hip_64bit.mk | 3 + 31 files changed, 29 insertions(+), 414 deletions(-) delete mode 100644 src/hip/math_constants.h diff --git a/.gitignore b/.gitignore index 53a4079d9ef..9f8c727d4d0 100644 --- a/.gitignore +++ b/.gitignore @@ -90,7 +90,3 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* - -# Eclipse sync project -.ptp-sync -.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index 5b177981ad8..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,21 +33,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index 5f9c48a6cde..37a75a5cade 100755 --- a/src/configure +++ b/src/configure @@ -295,12 +295,11 @@ function configure_rocm { ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk - # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards use - # __HIP_PLATFORM_AMD__ others __HIP_PLATFORM_HCC__ - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk - else - echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; fi # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, @@ -309,17 +308,10 @@ function configure_rocm { cat makefiles/hip_64bit.mk >> kaldi.mk else echo "\ -WARNING: ROCM will not be used! 
- ROCM is only supported with 64-bit Linux builds." +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." exit 1; fi - - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk - else - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk - fi } diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index d4eda345564..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -41,21 +41,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 06dceae73a5..9baa274e2ea 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -199,7 +199,7 @@ void CudaDecoder::AllocateHostData() { (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c0f54a854e8..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -44,21 +44,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 68c247b43e9..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -382,43 +382,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; - -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work; - - // perform factorization in batched - 
CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, &L_work, num_lanes)); - // allocate temp buffer - float *workspace = static_cast( - CuDevice::Instantiate().Malloc(L_work * sizeof(float))); - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); - - int L_work2; - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - // solve for rhs in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, - num_lanes)); - - CuDevice::Instantiate().Free(workspace); - CuDevice::Instantiate().Free(workspace2); -#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -429,7 +392,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); -#endif #endif // cusolver solves in place. Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 202232c6b23..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index f8947a3b5ed..bb78028118f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -27,6 +27,7 @@ #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. #ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 66f0dce395a..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index 56dbac93165..fa0e9f68237 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -299,14 +299,13 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. 
CuMatrix A(quadratic); - - #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); + // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); @@ -317,26 +316,9 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work2; - CUSOLVER_SAFE_CALL( - hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - CUSOLVER_SAFE_CALL(hipsolverSpotrs( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); - - CuDevice::Instantiate().Free(workspace2); -#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); -#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 3c1100753e5..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,20 +34,8 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index d81dca002ce..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,11 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f776bbb620e..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 7983cd250e7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index c4bdf569d3c..41ef7536a7f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,13 +32,8 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || 
ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 442d2dbac67..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 25775fb1b05..4d0be20ddc3 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,13 +21,10 @@ // limitations under the License. + #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 67b9f1d9e9b..bb1170314c4 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,13 +29,8 @@ #include #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include @@ -49,11 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 349b21b6591..3d7fae5c15e 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,15 +27,18 @@ #include #include -#include #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 +#include #include #include "hipify.h" +#define CUDART_INF HIP_INF +#define CUDART_INF_F HIP_INF_F #include "cudamatrix/cu-kernels-ansi.h" #include #include #else +#include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION @@ -2048,27 +2051,9 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } - - // if (tid == 0) { - // for (int j = 0; j < d.cols; j += 1) - // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); - // result[i] = tdata; - - // } - // return; - sdata[tid] = tdata; __syncthreads(); - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < CU1DBLOCK; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2077,16 +2062,6 @@ static void _transform_reduce_mat_cols( __syncthreads(); } - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < 2*warpSize; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - - // Reduce last warp. Threads implicitly synchronized within a warp. for (int shift = warpSize; shift > 0; shift >>= 1) { if (tid < warpSize) { diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 1245fb28bad..e1d59e777be 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), //16 - cell_dim = RandInt(2, 200), //45 - dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); //3 + int32 num_rows = RandInt(5, 20), + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,6 +232,7 @@ void UnitTestLstmNonlinearity() { else test_params = -1; + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -276,11 +277,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { // -1 + if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { // 0 + if (test_params >= 0) { delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 26a5281ec05..ecddd24db19 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,19 +2675,11 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { - - // if (CuDevice::Instantiate().Enabled()) { - // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); - // } - for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; + MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - // M.SetZero(); - // M.Add(0.5); - // M.SetZeroAboveDiag(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. // Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2701,16 +2693,6 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); - - // if (CuDevice::Instantiate().Enabled()) { - // CuVector col_sum(rows, kUndefined); - // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); - // KALDI_LOG << "Sums vector is " << col_sum; - // Real ans = col_sum.Sum(); - // KALDI_LOG << "Total sum is " << ans; - // KALDI_ERR << "Stopping!"; - // } - Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2723,13 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - //KALDI_LOG << "Random vector sum is " << col_sum; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; - } + } } } @@ -3081,7 +3061,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = false; + bool test_threads = true; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. 
int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 56acf340823..fd31758f0e6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,11 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 4de0fcba63d..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,11 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 86a3cd9a726..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 35ba3ee0c81..cda575b1914 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 739bab3dd59..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index f6426297e49..c88b3ebf50c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 56d7e869a32..efe4848c009 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -211,7 +211,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t -#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t @@ -221,17 +220,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize #define cusolverDnSpotrs hipsolverDnSpotrs #define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#else -#define cusolverDnCreate hipsolverCreate -#define cusolverDnDestroy hipsolverDestroy -#define cusolverDnHandle_t hipsolverHandle_t -#define cusolverDnSetStream hipsolverSetStream -#define cusolverDnSpotrf hipsolverSpotrf -#define cusolverDnSpotrfBatched hipsolverSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize -#define cusolverDnSpotrs 
hipsolverSpotrs -#define cusolverDnSpotrsBatched hipsolverSpotrsBatched -#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h deleted file mode 100644 index 7fb8fce8e71..00000000000 --- a/src/hip/math_constants.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__MATH_CONSTANTS_H__) -#define __MATH_CONSTANTS_H__ - -/* single precision constants */ -#define CUDART_INF_F __int_as_float(0x7f800000) -#define CUDART_NAN_F __int_as_float(0x7fffffff) -#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) -#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) -#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) -#define CUDART_ZERO_F 0.0f -#define CUDART_ONE_F 1.0f -#define CUDART_SQRT_HALF_F 0.707106781f -#define CUDART_SQRT_HALF_HI_F 0.707106781f -#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f -#define CUDART_SQRT_TWO_F 1.414213562f -#define CUDART_THIRD_F 0.333333333f -#define CUDART_PIO4_F 0.785398163f -#define CUDART_PIO2_F 1.570796327f -#define CUDART_3PIO4_F 2.356194490f -#define CUDART_2_OVER_PI_F 0.636619772f -#define CUDART_SQRT_2_OVER_PI_F 0.797884561f -#define CUDART_PI_F 3.141592654f -#define CUDART_L2E_F 1.442695041f -#define CUDART_L2T_F 3.321928094f -#define CUDART_LG2_F 0.301029996f -#define CUDART_LGE_F 0.434294482f -#define CUDART_LN2_F 0.693147181f -#define CUDART_LNT_F 2.302585093f -#define CUDART_LNPI_F 1.144729886f -#define CUDART_TWO_TO_M126_F 1.175494351e-38f -#define CUDART_TWO_TO_126_F 8.507059173e37f -#define CUDART_NORM_HUGE_F 3.402823466e38f -#define CUDART_TWO_TO_23_F 8388608.0f -#define CUDART_TWO_TO_24_F 16777216.0f -#define CUDART_TWO_TO_31_F 2147483648.0f -#define CUDART_TWO_TO_32_F 4294967296.0f -#define CUDART_REMQUO_BITS_F 3 -#define CUDART_REMQUO_MASK_F (~((~0)< Date: Mon, 6 Nov 2023 23:59:05 +0000 Subject: [PATCH 19/22] Fix formating to Google style. --- .../batched-static-nnet3-kernels.h | 1 + ...hed-threaded-nnet3-cuda-online-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline2.cc | 1 + src/cudadecoder/cuda-decoder-kernels.cu | 3 +- src/cudadecoder/cuda-decoder.cc | 22 +- src/cudadecoder/cuda-decoder.h | 1 + src/cudadecoder/cuda-fst.cc | 1 + .../batched-wav-nnet3-cuda-online.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 1 + ...eature-online-batched-cmvn-cuda-kernels.cu | 1 + ...ure-online-batched-ivector-cuda-kernels.cu | 27 +- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- .../feature-online-batched-spectral-cuda.h | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 2 + src/cudafeat/feature-spectral-cuda.h | 1 + src/cudafeat/feature-window-cuda.cu | 1 + .../online-batched-feature-pipeline-cuda.cc | 4 +- .../online-ivector-feature-cuda-kernels.cu | 28 +- src/cudafeat/online-ivector-feature-cuda.cc | 1 + src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 3 +- src/cudamatrix/cu-array-inl.h | 1 + src/cudamatrix/cu-array.cc | 1 + src/cudamatrix/cu-block-matrix.cc | 1 + src/cudamatrix/cu-common.cc | 121 +++-- src/cudamatrix/cu-common.h | 5 +- src/cudamatrix/cu-compressed-matrix.cc | 1 + src/cudamatrix/cu-device.cc | 11 +- src/cudamatrix/cu-device.h | 5 +- src/cudamatrix/cu-kernels.cu | 107 ++-- src/cudamatrix/cu-matrix-test.cc | 6 +- src/cudamatrix/cu-matrix.cc | 1 + src/cudamatrix/cu-packed-matrix.cc | 1 + src/cudamatrix/cu-sp-matrix.cc | 1 + src/cudamatrix/cu-sparse-matrix.cc | 1 + src/cudamatrix/cu-tp-matrix.cc | 1 + src/cudamatrix/cu-vector.cc | 16 +- src/cudamatrix/cublas-wrappers.h | 13 +- src/hip/hipify.h | 488 +++++++++--------- 42 files changed, 512 insertions(+), 384 deletions(-) diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 
0bcb1997576..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index c7012b686e0..ed0c0a2f5e9 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -23,6 +23,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index d5cf7dae2d7..23d0ca283a2 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -28,6 +28,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index f6a3455db01..01d6b1165e7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 6a14371911d..8503182c1f8 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -16,8 +16,9 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ -#include "float.h" #include + +#include "float.h" #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 9baa274e2ea..056d563a791 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -40,6 +40,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -190,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_tmp_, + (void 
**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index 510904aa004..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 3af37eb7676..682485f6ce4 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 56368853df2..2bc0a483a0f 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -25,8 +25,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 05af50d7a3b..0e4a719bc75 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -19,8 +19,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index c14571f2ed9..b2ad9254c67 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -22,6 +22,7 @@ #include #include #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index 7a521d43693..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -17,6 +17,7 @@ // #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index e5b89d163e5..da2ba24bd90 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -102,8 +103,9 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, 
GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, num_lanes); zero_invalid_posteriors_kernel<<>>( @@ -217,8 +219,10 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -314,8 +318,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // is less than stash size KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); - // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); @@ -325,9 +329,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -510,7 +516,8 @@ void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index 27375f4914e..856d2acab81 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -18,8 +18,10 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" #ifdef __IS_HIP_COMPILE__ -#include #include + +#include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..d18f5237e8f 100644 --- 
a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index bb78028118f..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -18,6 +18,7 @@ #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 #include + #include "hipify.h" #else #include @@ -189,7 +190,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 9c0d5df5288..d8fc215b80b 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -19,7 +19,9 @@ #ifdef __IS_HIP_COMPILE__ #include + #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index 6ba45e682c1..60fe113d402 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 650b51ec3c7..7736f525237 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -100,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index dffc9fd3c8f..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -17,6 +17,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -34,9 +35,12 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps - __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - 
__shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,7 +145,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; @@ -207,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index fa0e9f68237..f96b2a81ce2 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" // The BLAS enumerators are used instead of the SOLVER ones. 
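A note on the launch-geometry idiom being reformatted throughout these kernels: block and grid sizes are derived from GPU_WARP_SIZE and GPU_MAX_WARPS_PER_BLOCK (defined in src/hip/hipify.h) rather than from hard-coded 32 and 32, so the same source sizes blocks correctly for AMD's 64-wide wave-fronts. A minimal sketch of the pattern, using the hypothetical helper name blocks_2d:

static inline dim3 blocks_2d(int num_cols, int num_rows, int num_lanes) {
  // Pair with dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); round up
  // so partial tiles at the right and bottom edges are still covered.
  return dim3((num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
              (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
              num_lanes);
}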
#ifdef CUBLAS_FILL_MODE_LOWER diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,6 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -33,7 +34,6 @@ #include #endif - #include #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,9 +24,10 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 1fd80502cf9..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,6 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 333e8fbed1c..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,6 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 2e77062f20d..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #define API_NAME_PREFIX "HIP" #else @@ -59,7 +60,7 @@ NvtxTracer::~NvtxTracer() { #ifdef __IS_HIP_COMPILE__ roctxRangePop(); #else - nvtxRangePop(); + nvtxRangePop(); #endif } #endif @@ -102,19 +103,31 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + 
case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; #endif } return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; @@ -124,20 +137,32 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } return
API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } @@ -146,21 +171,35 @@ const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; #endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 41ef7536a7f..934668da6f2 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,11 +32,12 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include -#include #include +#include #include +#include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,6 
+22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 4d0be20ddc3..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,15 +24,16 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include #include #include -#endif // __IS_HIP_COMPILE__ +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -246,12 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode - #ifdef __IS_HIP_COMPILE__ +#ifdef __IS_HIP_COMPILE__ KALDI_WARN << "Not in compute-exclusive mode."; - #else +#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; - #endif +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index bb1170314c4..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,12 @@ #include #ifdef __IS_HIP_COMPILE__ -#include -#include #include #include +#include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 3d7fae5c15e..8d5784acb52 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -31,18 +31,18 @@ #define __CUDA_ARCH__ 800 #include #include + #include "hipify.h" -#define CUDART_INF HIP_INF -#define CUDART_INF_F HIP_INF_F -#include "cudamatrix/cu-kernels-ansi.h" -#include #include +#include + +#include "cudamatrix/cu-kernels-ansi.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION -#endif //__IS_HIP_COMPILE__ +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -1122,7 +1122,7 @@ void trace_mat_mat_trans_atomic(Real *d_result, // Assuming *d_result is set to zero already constexpr int THREADS_X = GPU_WARP_SIZE; - constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -2111,7 +2111,7 @@ static void _group_transform_reduce( // tree-reduce to 2x warpSize elements per group int shift = threads_per_group / 2; -# pragma unroll +#pragma unroll for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); @@ -4009,9 +4009,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4390,7 +4390,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, 
B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4413,8 +4413,8 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -4426,10 +4426,10 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5105,7 +5105,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5128,8 +5128,8 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -5141,10 +5141,10 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5516,25 +5516,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5831,8 +5831,9 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // this will synchronize all threads without blocking. 
void cuda_legacy_noop() { #ifdef __IS_HIP_COMPILE__ - // HIP doesn't currently support cudaStreamLegacy stream so we force the implementation to use the - // legacy (not per-thread) API to get similar semantics. + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. auto k = reinterpret_cast(_noop_kernel); hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); #else @@ -5847,8 +5848,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5861,8 +5864,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5871,8 +5876,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5898,9 +5902,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5920,10 +5925,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5936,8 +5942,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5963,9 +5968,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5986,10 +5992,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index ecddd24db19..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2679,7 +2679,7 @@ static void UnitTestCuMatrixSetRandUniform() { MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
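// For reference: for a uniform distribution on [-1/2, 1/2] the odd central
// moments vanish and the even ones are (1/2)^p / (p + 1), e.g. 1/12 for
// p = 2 (the variance).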
// Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2705,11 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - } + } } } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index fd31758f0e6..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,6 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index cda575b1914..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index c88b3ebf50c..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -649,8 +650,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - GPU_WARP_SIZE/2 : GPU_WARP_SIZE; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -676,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? 
GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index dc5c0e0ced5..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -37,8 +37,9 @@ inline cublasStatus_t cublas_gemm( const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, - C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -66,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, - (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif diff --git a/src/hip/hipify.h b/src/hip/hipify.h index efe4848c009..e9ca483d022 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -2,250 +2,262 @@ #define __HIPIFY_H__ #ifdef __HIPCC__ -inline __device__ void __syncwarp(unsigned mask=0xffffffff) { - // On CDNA hardware wave-fronts (warps) execute always in - // lock step. Though it might still be important to signal - // that the compiler can't reorder code around certain code - // sections that rely on data sharing mecanisms like LDS - // (shared memory). So this implements a No-op but is seen - // by the compiler as having side effects. - __asm__("s_nop 0"); +inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { + // On CDNA hardware wave-fronts (warps) execute always in + // lock step. Though it might still be important to signal + // that the compiler can't reorder code around certain code + // sections that rely on data sharing mechanisms like LDS + // (shared memory).
So this implements a No-op but is seen + // by the compiler as having side effects. + __asm__("s_nop 0"); - // A saffest option, arguably less performant would be to use: - // __asm__("s_waitcnt lgkmcnt(0)"); Í - // to explicitly do a memory fence. + // A safer option, arguably less performant, would be to use: + // __asm__("s_waitcnt lgkmcnt(0)"); + // to explicitly do a memory fence. } // AMDGCN only support this rounding mode. #define __fdiv_rd __fdiv_rn #else -#define __align__(x) __attribute__((aligned (x))) +#define __align__(x) __attribute__((aligned(x))) #endif // // HIP types // -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. -#define CUBLAS_OP_C HIPBLAS_OP_C -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_R_32F HIPBLAS_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUDA_R_32F HIP_R_32F -#define CUDA_R_64F HIP_R_64F -#define CUFFT_R2C HIPFFT_R2C -#define CUFFT_SUCCESS HIPFFT_SUCCESS -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define CUSPARSE_INDEX_BASE_ZERO
HIPSPARSE_INDEX_BASE_ZERO -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define cuDeviceGetName hipDeviceGetName -#define cuMemGetInfo_v2 hipMemGetInfo -#define cublasComputeType_t hipblasDatatype_t -#define cublasCreate hipblasCreate -#define cublasDasum_v2 hipblasDasum -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasDcopy_v2 hipblasDcopy -#define cublasDdot_v2 hipblasDdot -#define cublasDestroy hipblasDestroy -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasDgemm_v2 hipblasDgemm -#define cublasDgemv_v2 hipblasDgemv -#define cublasDger_v2 hipblasDger -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasDscal_v2 hipblasDscal -#define cublasDspmv_v2 hipblasDspmv -#define cublasDspr_v2 hipblasDspr -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmEx hipblasGemmEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasOperation_t hipblasOperation_t -#define cublasSasum_v2 hipblasSasum -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasScopy_v2 hipblasScopy -#define cublasSdot_v2 hipblasSdot -#define cublasSetStream hipblasSetStream -#define cublasSgemv_v2 hipblasSgemv -#define cublasSger_v2 hipblasSger -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasSscal_v2 hipblasSscal -#define cublasSspmv_v2 hipblasSspmv -#define cublasSspr_v2 hipblasSspr -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasStatus_t hipblasStatus_t -#define cublasStatus_t hipblasStatus_t -#define cublasStpmv_v2 hipblasStpmv -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaDataType hipDataType -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceReset hipDeviceReset -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cudaError_t hipError_t -#define cudaEventCreate hipEventCreate 
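The pattern on display in this header, in both the old and the reformatted version, is a one-to-one textual mapping: every CUDA toolkit identifier is #defined to its HIP/ROCm counterpart so that call sites can keep the CUDA spellings. A hypothetical call site, assuming the mappings in this table, compiles against either runtime:

#ifdef __IS_HIP_COMPILE__
#include "hipify.h"  // rewrites cudaMalloc -> hipMalloc, cudaFree -> hipFree, ...
#else
#include <cuda_runtime.h>
#endif

void demo() {
  float *d = nullptr;
  // Written as CUDA; compiled as HIP when __IS_HIP_COMPILE__ is defined.
  if (cudaMalloc((void **)&d, 1024 * sizeof(float)) == cudaSuccess) {
    cudaMemsetAsync(d, 0, 1024 * sizeof(float), cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaFree(d);
  }
}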
-#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDestroy hipEventDestroy -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorName hipGetErrorName -#define cudaGetErrorString hipGetErrorString -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterDefault hipHostRegisterDefault -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost hipHostMalloc -#define cudaMallocPitch hipMallocPitch -#define cudaMemcpy hipMemcpy -// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP \ + HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements + // for explicit tensor GEMMs. 
+#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED \ + HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES \ + HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED \ + HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName +#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate 
hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasDtrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasStrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy +// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized // copies, which should be canceled by ROCm 5.7.1+. 
Then the following would // be sufficient: // #define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \ - [&]() -> hipError_t { \ - if (width && height) \ - return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \ - return hipSuccess; \ - }() -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemGetInfo hipMemGetInfo -#define cudaMemset2DAsync hipMemset2DAsync -#define cudaMemsetAsync hipMemsetAsync -#define cudaProfilerStop hipProfilerStop -#define cudaSetDevice hipSetDevice -#define cudaStreamCreate hipStreamCreate -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread ((hipStream_t)2) -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define cufftComplex hipfftComplex -#define cufftDestroy hipfftDestroy -#define cufftExecR2C hipfftExecR2C -#define cufftHandle hipfftHandle -#define cufftPlanMany hipfftPlanMany -#define cufftSetStream hipfftSetStream -#define curandCreateGenerator hiprandCreateGenerator -#define curandDestroyGenerator hiprandDestroyGenerator -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerator_t hiprandGenerator_t -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
-#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
-#define curandSetStream hiprandSetStream
-#define curandStatus_t hiprandStatus_t
-#define cusolverDnCreate hipsolverDnCreate
-#define cusolverDnDestroy hipsolverDnDestroy
-#define cusolverDnHandle_t hipsolverDnHandle_t
-#define cusolverDnSetStream hipsolverDnSetStream
-#define cusolverDnSpotrf hipsolverDnSpotrf
-#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
-#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
-#define cusolverDnSpotrs hipsolverDnSpotrs
-#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
-#define cusparseAction_t hipsparseAction_t
-#define cusparseCreate hipsparseCreate
-#define cusparseCreateCsr hipsparseCreateCsr
-#define cusparseCreateDnMat hipsparseCreateDnMat
-#define cusparseCreateMatDescr hipsparseCreateMatDescr
-#define cusparseDcsr2csc hipsparseDcsr2csc
-#define cusparseDestroy hipsparseDestroy
-#define cusparseDestroy hipsparseDestroy
-#define cusparseDestroyDnMat hipsparseDestroyDnMat
-#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
-#define cusparseDestroySpMat hipsparseDestroySpMat
-#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
-#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
-#define cusparseHandle_t hipsparseHandle_t
-#define cusparseIndexBase_t hipsparseIndexBase_t
-#define cusparseMatDescr_t hipsparseMatDescr_t
-#define cusparseOperation_t hipsparseOperation_t
-#define cusparseScsr2csc hipsparseScsr2csc
-#define cusparseSetStream hipsparseSetStream
-#define cusparseSpMM hipsparseSpMM
-#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
-#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
-#define cusparseStatus_t hipsparseStatus_t
-#define nvtxRangePop roctxRangePop
-#define nvtxRangePush roctxRangePush
-#define nvtxRangePushA roctxRangePushA
+#define cudaMemcpy2DAsync(a, b, c, d, width, height, e, f)      \
+  [&]() -> hipError_t {                                         \
+    if (width && height)                                        \
+      return hipMemcpy2DAsync(a, b, c, d, width, height, e, f); \
+    return hipSuccess;                                          \
+  }()
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaMemset2DAsync hipMemset2DAsync
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaProfilerStop hipProfilerStop
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread ((hipStream_t)2)
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define cufftComplex hipfftComplex
+#define cufftDestroy hipfftDestroy
+#define cufftExecR2C hipfftExecR2C
+#define cufftHandle hipfftHandle
+#define cufftPlanMany hipfftPlanMany
+#define cufftSetStream hipfftSetStream
+#define curandCreateGenerator hiprandCreateGenerator
+#define curandDestroyGenerator hiprandDestroyGenerator
+#define curandGenerateNormal hiprandGenerateNormal
+#define curandGenerateNormalDouble hiprandGenerateNormalDouble
+#define curandGenerateUniform hiprandGenerateUniform
+#define curandGenerateUniformDouble hiprandGenerateUniformDouble
+#define curandGenerator_t hiprandGenerator_t
+#define curandSetGeneratorOffset hiprandSetGeneratorOffset
+#define curandSetGeneratorOrdering(x, y) \
+  0  // HIP does not support generator ordering.
+#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
+#define curandSetStream hiprandSetStream
+#define curandStatus_t hiprandStatus_t
+#define cusolverDnCreate hipsolverDnCreate
+#define cusolverDnDestroy hipsolverDnDestroy
+#define cusolverDnHandle_t hipsolverDnHandle_t
+#define cusolverDnSetStream hipsolverDnSetStream
+#define cusolverDnSpotrf hipsolverDnSpotrf
+#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
+#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
+#define cusolverDnSpotrs hipsolverDnSpotrs
+#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
+#define cusparseAction_t hipsparseAction_t
+#define cusparseCreate hipsparseCreate
+#define cusparseCreateCsr hipsparseCreateCsr
+#define cusparseCreateDnMat hipsparseCreateDnMat
+#define cusparseCreateMatDescr hipsparseCreateMatDescr
+#define cusparseDcsr2csc hipsparseDcsr2csc
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroyDnMat hipsparseDestroyDnMat
+#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
+#define cusparseDestroySpMat hipsparseDestroySpMat
+#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
+#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
+#define cusparseHandle_t hipsparseHandle_t
+#define cusparseIndexBase_t hipsparseIndexBase_t
+#define cusparseMatDescr_t hipsparseMatDescr_t
+#define cusparseOperation_t hipsparseOperation_t
+#define cusparseScsr2csc hipsparseScsr2csc
+#define cusparseSetStream hipsparseSetStream
+#define cusparseSpMM hipsparseSpMM
+#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
+#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
+#define cusparseStatus_t hipsparseStatus_t
+#define nvtxRangePop roctxRangePop
+#define nvtxRangePush roctxRangePush
+#define nvtxRangePushA roctxRangePushA
 //
 // HIPCUB namespace.
 //
@@ -256,8 +268,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define CUDART_CB
 
+//
+// Math constants
+//
+#define CUDART_INF HIP_INF
+#define CUDART_INF_F HIP_INF_F
+
+//
+// GPU static hardware characteristics.
+//
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024
-#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
-#endif //__HIPIFY_H__
-
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
+#endif //__HIPIFY_H__

From 3aaa32637850c919af905b1c799b3f4919d804cd Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:00:01 +0000
Subject: [PATCH 20/22] Fix more formatting to Google style.
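
For context, a sketch of what the hipify.h layer reformatted here enables
(illustrative only, not part of this patch; `host_buf`, `dev_buf` and `n` are
placeholder names): Kaldi host code written against the CUDA API builds
unchanged with hipcc, because every cuda* name below expands to its hip*
counterpart through the defines above.

  // Illustrative CUDA-spelled host code; under __IS_HIP_COMPILE__ the
  // hipify.h defines retarget each call to the HIP runtime.
  cudaStream_t stream;
  CU_SAFE_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  float *dev_buf;
  CU_SAFE_CALL(cudaMalloc(&dev_buf, n * sizeof(float)));
  CU_SAFE_CALL(cudaMemcpyAsync(dev_buf, host_buf, n * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
  CU_SAFE_CALL(cudaStreamSynchronize(stream));
  CU_SAFE_CALL(cudaFree(dev_buf));
  CU_SAFE_CALL(cudaStreamDestroy(stream));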
---
 src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu | 3 ++-
 src/cudamatrix/cu-kernels.cu                                | 2 +-
 src/hip/hipify.h                                            | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
index da2ba24bd90..5b94c34e829 100644
--- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
+++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu
@@ -222,7 +222,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim,
   int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE *
                 GPU_MAX_WARPS_PER_BLOCK;  // round up to the nearest warp size
   if (threads > GPU_MAX_THREADS_PER_BLOCK)
-    threads = GPU_MAX_THREADS_PER_BLOCK;  // Max block size is GPU_MAX_THREADS_PER_BLOCK threads
+    threads = GPU_MAX_THREADS_PER_BLOCK;  // Max block size is
+                                          // GPU_MAX_THREADS_PER_BLOCK threads
 
   dim3 blocks(num_chunk_frames, num_lanes);
 
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8d5784acb52..9127819eca5 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -32,11 +32,11 @@
 #include
 #include
 
-#include "hipify.h"
 #include
 #include
 
 #include "cudamatrix/cu-kernels-ansi.h"
+#include "hipify.h"
 #else
 #include
 #include "cudamatrix/cu-kernels-ansi.h"
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index e9ca483d022..459372e68b8 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -275,7 +275,7 @@ inline __device__ void __syncwarp(unsigned mask = 0xffffffff) {
 #define CUDART_INF_F HIP_INF_F
 
 //
-// GPU static hardware characteristics. 
+// GPU static hardware characteristics.
 //
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024

From 6ebab7023b01a4270cbd07b5c3bfce7f1ca2c461 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:25:49 +0000
Subject: [PATCH 21/22] Fix header ordering.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9127819eca5..9df6cea6e9d 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -32,8 +32,8 @@
 #include
 #include
 
-#include
 #include
+#include
 
 #include "cudamatrix/cu-kernels-ansi.h"
 #include "hipify.h"

From 7efdeaeb10ed0ae2593ee69faa04b5172a39aba9 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 05:16:09 -0600
Subject: [PATCH 22/22] Add GPU characteristics for CUDA.

---
 src/cudamatrix/cu-common.h   | 4 ++++
 src/cudamatrix/cu-kernels.cu | 1 +
 2 files changed, 5 insertions(+)

diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 934668da6f2..3206fe7e7f4 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -45,6 +45,10 @@
 #include
 #include
 #include
+
+#define GPU_WARP_SIZE 32
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
 #endif
 
 #define CU_SAFE_CALL(fun) \
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9df6cea6e9d..b3c3165bd96 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -39,6 +39,7 @@
 #include "hipify.h"
 #else
 #include
+#include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-kernels-ansi.h"
 #include
 #include  // for CUDA_VERSION
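
A closing note on the GPU_* characteristics introduced above (a sketch under
assumptions, not part of the patches; `num_rows` is a placeholder name): with
GPU_WARP_SIZE set to 64 for ROCm in hipify.h and to 32 for CUDA in
cu-common.h, launch-sizing code can be written once for both back ends, in
the spirit of splice_features_batched:

  // Round the thread count up to a whole number of warps, then clamp it to
  // the per-block limit; correct for both 32- and 64-wide warps.
  int threads = (num_rows + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
  if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK;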