From b8a6b90e5f9adb1319fde4e219b46931847bd345 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Fri, 23 Mar 2018 12:50:58 -0500 Subject: [PATCH 01/12] First round. --- doc/src/releasenotes.md | 36 ++++++++++++++++++++++++++++++++++++ include/miopen/miopen.h | 22 +++++++++++++--------- src/ocl/rnnocl.cpp | 2 +- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md index 14af3fcc4f..5c06040eff 100644 --- a/doc/src/releasenotes.md +++ b/doc/src/releasenotes.md @@ -1,6 +1,42 @@ ## MIOpen Release notes +### 03/30/2018 [ 1.3.0 ] + +Notes: + +This release contain performance enhancements and bug fixes to multiple parts of the MIOpen library. +16-bit floating point (fp16) support has been added for most kernels, with the exception of RNN. + +Changed: + +- Added 2 new API for RNNs +- Added in support for uninitialized hidden states and nullptr outputs in RNNs +- Added new convolutions for 1x1 (Jing Zhang, this went in correct?)? +- Added support for Set and Scale operations for strided tensors with dimensions 1,2,3,4,5 +- Added the transpose + GEMM algorithm for 1x1 convolution (forward and backward data) +- Added fp16 support for all layers except RNNs +- Added the transpose + GEMM algorithm for 1x1 convolution (forward and backward data) +- Improved over MIOpen layer and operations' performance (I removed the host side claim) +- Improved Batch Normalization performance +- Improved RNN performance +- Fixed logic issues in get and set layer functions and related w_supertensor test +- Fixed hang in batch norm with batch sizes greater than 256 +- Fixed logic issues in get and set layer functions and related w_supertensor test +- Fixed various RNN bugs + + +Known Issues: + +- RNNs do not support fp16 + + +### 03/08/2018 [ 1.2.1 ] + +Notes: + +- This release adds support for ROCm 1.7.1. + ### 12/15/2017 [ 1.2.0 ] diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 1ac1bcb86f..c9c70b8630 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -289,11 +289,11 @@ typedef enum { /*! @ingroup convolutions * @enum miopenConvolutionMode_t - * Convolution mode selection for convolution layer preference + * Convolution mode selection for convolution layer preference. */ typedef enum { - miopenConvolution = 0, /*!< Convolutions */ - miopenTranspose = 1, /*!< Transpose convolutions */ + miopenConvolution = 0, /*!< Cross-Correlation convolution */ + miopenTranspose = 1, /*!< Transpose convolutions -- deconvolution */ } miopenConvolutionMode_t; /*! @ingroup padding @@ -613,7 +613,7 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyConvolutionDescriptor(miopenConvolutionDescriptor_t convDesc); /*! @enum miopenConvFwdAlgorithm_t - * Convolutional algorithm mode for forward propagation. + * Convolutional algorithm mode for forward propagation. MIOpen use cross-correlation for its convolution implementation. */ typedef enum { miopenConvolutionFwdAlgoGEMM = 0, /*!< GEMM variant */ @@ -1767,11 +1767,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNDescriptor(miopenRNNDescriptor_t rnnDes int* hiddenSize, int* layer); -/* // discuss later -MIOPEN_EXPORT miopenStatus_t miopenGetRNNDescriptor( - miopenRNNDescriptor_t rnnDesc, miopenRNNMode_t* mode, int* seqLength, int* layer, int* bidir -*/ - /*! 
@brief Destroys the tensor descriptor object * * @param rnnDesc RNN tensor descriptor type (input) @@ -1951,6 +1946,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNHiddenTensorSize(miopenHandle_t handle, * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. * + * When inputSkip mode is selected there is no input layer matrix operation, therefore + * miopenGetRNNLayerParamSize will return zero for matrices associated with the inputs. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -1993,6 +1990,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamSize(miopenHandle_t handle, * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. * + * When inputSkip mode is selected there is no input layer matrix operation, therefore + * miopenGetRNNLayerBiasSize will return zero for biases associated with the inputs. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2044,6 +2043,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasSize(miopenHandle_t handle, * nullptr then only the paramDesc is populated and returned. The size in bytes of the * layer parameter matrix can be determined by using miopenGetRNNLayerParamSize(). * + * Note: When inputSkip mode is selected there is no input layer matrix operation, therefore + * miopenGetRNNLayerParam will return a error status miopenStatusBadParm. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2104,6 +2105,9 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParam(miopenHandle_t handle, * nullptr then only the biasDesc is populated and returned. The size in bytes of the * layer bias can be determined by using miopenGetRNNLayerBiasSize(). * + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerBias will return + * a error status miopenStatusBadParm. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index c1edb6f444..6a42a2fcec 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -3682,7 +3682,7 @@ void RNNDescriptor::RNNBackwardWeights(Handle& handle, { int in_bias_val = inputMode == miopenRNNskip ? 0 : wei_stride; - hid_shift = li * batch_n * hy_stride; + hid_shift = li * batch_n * hy_stride; wei_shift = (li == 0) ? (wei_shift_bias + in_bias_val) : (wei_shift_bias + in_bias_val + li * 2 * wei_stride); From fad0fd92088d6725c08977d53c81c9435b51abb7 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Fri, 23 Mar 2018 17:15:13 -0500 Subject: [PATCH 02/12] Added in documentation updates for 1.3.0. --- README.md | 44 +++--- doc/src/cache.md | 2 +- doc/src/perfdatabase.md | 15 ++ doc/src/releasenotes.md | 23 ++-- doc/src/rnn.rst | 12 +- include/miopen/miopen.h | 299 +++++++++++++++++++++++++--------------- 6 files changed, 248 insertions(+), 147 deletions(-) diff --git a/README.md b/README.md index 1da29d5bb6..b6ff4241d5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,27 @@ AMD's library for high peformance machine learning primitives. 
MIOpen supports t * [Boost](http://www.boost.org/) at least version 1.58 * MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/doc/src/cache.md) -Instructions to install the above dependencies are present in this [section](#installing-the-dependencies). +## Installing the dependencies + +The dependencies can be installed with the `install_deps.cmake`, script: + +``` +cmake -P install_deps.cmake +``` + +This will install by default to `/usr/local` but it can be installed in another location with `--prefix` argument: + +``` +cmake -P install_deps.cmake --prefix /some/local/dir +``` + +If Ubuntu v16 is used then the `OpenSSL` and `Boost` packages can also be installed by: +``` +sudo apt-get install libssl-dev +sudo apt-get install libboost-dev +sudo apt-get install libboost-system-dev +sudo apt-get install libboost-filesystem-dev +``` ## Installing MIOpen with pre-built packages @@ -167,24 +187,12 @@ Also, githooks can be installed to format the code per-commit: ./.githooks/install ``` -## Installing the dependencies +## Using docker -The dependencies can be installed with the `install_deps.cmake`, script: +The easiest way is to use docker. You can build the top-level docker file: -``` -cmake -P install_deps.cmake -``` + docker build -t miopen . -This will install by default to `/usr/local` but it can be installed in another location with `--prefix` argument: +Then to enter the developement environment use `docker run`: -``` -cmake -P install_deps.cmake --prefix /some/local/dir -``` - -If Ubuntu v16 is used then the `OpenSSL` and `Boost` packages can also be installed by: -``` -sudo apt-get install libssl-dev -sudo apt-get install libboost-dev -sudo apt-get install libboost-system-dev -sudo apt-get install libboost-filesystem-dev -``` + docker run --device='/dev/kfd' --device='/dev/dri' -v=`pwd`:/data -w /data --group-add video -it miopen diff --git a/doc/src/cache.md b/doc/src/cache.md index 71a7b73a29..4ab864f724 100644 --- a/doc/src/cache.md +++ b/doc/src/cache.md @@ -6,7 +6,7 @@ MIOpen will cache binary kernels to disk, so they don't need to be compiled the Clear the cache --------------- -The cache can be cleared by simply deleting the cache directory(ie `$HOME/.cache/miopen`). This should only be needed for development purposes or to free disk space. The cache does not need to be cleared when upgrading MIOpen. +The cache can be cleared by simply deleting the cache directory (i.e., `$HOME/.cache/miopen`). This should only be needed for development purposes or to free disk space. The cache does not need to be cleared when upgrading MIOpen. Disabling the cache ------------------- diff --git a/doc/src/perfdatabase.md b/doc/src/perfdatabase.md index ce16f88711..11b0ce694c 100644 --- a/doc/src/perfdatabase.md +++ b/doc/src/perfdatabase.md @@ -11,6 +11,9 @@ MIOpen performs Exhaustive Search only if explicitly requested via MIOpen API an The optimized solution found during the successful Search process is written into the PerfDb for future re-use. That is why MIOpen will not Search for optimized solution more than once for a given problem in this mode. +See documentation about miopenFind*() API calls for more info on how Search can be explicitly requested. + + **DB_UPDATE (2)** Similar to NONE, but Search will NOT be skipped if PerfDb contains relevant record. If Search is requested via MIOpen API, then MIOpen will perform the Search and update PerfDb. 
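For reference, the Search mentioned here is requested through the miopenFind*() calls referenced above by passing exhaustiveSearch == 1. The sketch below is a minimal, illustrative example for a forward convolution; it assumes the HIP backend for allocations, that all descriptors and device buffers have already been created, and that the argument order of the workspace query matches the MIOpen 1.x header (check miopen.h if in doubt).

```
#include <miopen/miopen.h>
#include <hip/hip_runtime.h>

// Explicitly request an exhaustive Search for a forward convolution.
// Descriptors and device buffers are assumed to be set up by the caller.
void FindBestFwdAlgo(miopenHandle_t handle,
                     miopenTensorDescriptor_t xDesc, void* x,
                     miopenTensorDescriptor_t wDesc, void* w,
                     miopenConvolutionDescriptor_t convDesc,
                     miopenTensorDescriptor_t yDesc, void* y)
{
    size_t workSpaceSize = 0;
    miopenConvolutionForwardGetWorkSpaceSize(handle, wDesc, xDesc, convDesc, yDesc,
                                             &workSpaceSize);

    void* workSpace = nullptr;
    hipMalloc(&workSpace, workSpaceSize); // HIP backend assumed

    int returnedAlgoCount = 0;
    miopenConvAlgoPerf_t perfResults[4];

    // exhaustiveSearch == 1 explicitly requests the Search; the winning
    // configuration is then written to the PerfDb as described above.
    miopenFindConvolutionForwardAlgorithm(handle, xDesc, x, wDesc, w, convDesc,
                                          yDesc, y, 4, &returnedAlgoCount,
                                          perfResults, workSpace, workSpaceSize, 1);

    hipFree(workSpace);
}
```

The best performing algorithm and its workspace requirement are reported in the returned perfResults entries.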
@@ -34,3 +37,15 @@ Note: This mode is intended for tuning the MIOpen installation. When MIOpen is i **DB_CLEAN (5)** MIOpen removes relevant records from the PerfDb instead of just reading and using those. Search is blocked, even if explicitly requested. + +## MIOPEN_FIND_ENFORCE_SCOPE + +This variable allows to limit the scope of `MIOPEN_FIND_ENFORCE`, so that only forward, backward data or backward weights convolutions will be affected. Both symbolic and numeric values are supported, as shown below. + +**ALL (1)** `MIOPEN_FIND_ENFORCE` affects all convolutions. This is the default. + +**CONV_FWD (2)** `MIOPEN_FIND_ENFORCE` affects only Forward convolutions. + +**CONV_BWD (3)** `MIOPEN_FIND_ENFORCE` affects only Backward Data convolutions. + +**CONV_WRW (3)** `MIOPEN_FIND_ENFORCE` affects only Backward With Regard to Weights (a.k.a WRW) convolutions. \ No newline at end of file diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md index 5c06040eff..9c0d6bf705 100644 --- a/doc/src/releasenotes.md +++ b/doc/src/releasenotes.md @@ -5,30 +5,27 @@ Notes: -This release contain performance enhancements and bug fixes to multiple parts of the MIOpen library. -16-bit floating point (fp16) support has been added for most kernels, with the exception of RNN. +- This release adds fp16 support for Inference using CNNs +- Performance improvements for RNNs +- Performance improvements for convolutions using 1x1 filters +- Performance improvement for Batch Normalization +- Bug fixes for various components of MIOpen Changed: -- Added 2 new API for RNNs +- Added 2 new API for RNNs: miopenGetRNNLayerParamOffset and miopenGetRNNLayerBiasOffset - Added in support for uninitialized hidden states and nullptr outputs in RNNs -- Added new convolutions for 1x1 (Jing Zhang, this went in correct?)? - Added support for Set and Scale operations for strided tensors with dimensions 1,2,3,4,5 -- Added the transpose + GEMM algorithm for 1x1 convolution (forward and backward data) -- Added fp16 support for all layers except RNNs -- Added the transpose + GEMM algorithm for 1x1 convolution (forward and backward data) -- Improved over MIOpen layer and operations' performance (I removed the host side claim) -- Improved Batch Normalization performance -- Improved RNN performance +- Added multi-thread and multi-process support for the performance database +- Improved performance for OpTensor +- Fixed bug in convolutions for backward bias - Fixed logic issues in get and set layer functions and related w_supertensor test - Fixed hang in batch norm with batch sizes greater than 256 -- Fixed logic issues in get and set layer functions and related w_supertensor test -- Fixed various RNN bugs - Known Issues: - RNNs do not support fp16 +- Training with CNNs does not support fp16 ### 03/08/2018 [ 1.2.1 ] diff --git a/doc/src/rnn.rst b/doc/src/rnn.rst index b57fc80c18..3a53861c97 100644 --- a/doc/src/rnn.rst +++ b/doc/src/rnn.rst @@ -51,7 +51,6 @@ miopenGetRNNDescriptor .. doxygenfunction:: miopenGetRNNDescriptor - miopenDestroyRNNDescriptor -------------------------- @@ -135,6 +134,17 @@ miopenSetRNNLayerBias .. doxygenfunction:: miopenSetRNNLayerBias +miopenGetRNNLayerParamOffset +---------------------------- + +.. doxygenfunction:: miopenGetRNNLayerParamOffset + + +miopenGetRNNLayerBiasOffset +--------------------------- + +.. 
doxygenfunction:: miopenGetRNNLayerBiasOffset + miopenRNNForwardTraining ------------------------ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index c9c70b8630..bfe9054236 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -698,10 +698,10 @@ miopenConvolutionForwardGetWorkSpaceSize(miopenHandle_t handle, * to execute this function, miopenConvolutionForwardGetWorkSpaceSize() must be * run to determine the required memory for this search. * - * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a * configuration match is not found, a default configuration will be returned. * - * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -831,10 +831,10 @@ miopenConvolutionBackwardDataGetWorkSpaceSize(miopenHandle_t handle, * execute this function, miopenConvolutionBackwardsDataGetWorkSpaceSize() must be run to determine * the required memory for this search. * - * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a * configuration match is not found, a default configuration will be returned. * - * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -944,10 +944,10 @@ miopenConvolutionBackwardWeightsGetWorkSpaceSize(miopenHandle_t handle, * execute this function, miopenConvolutionBackwardsWeightsGetWorkSpaceSize() must be run to * determine the required memory for this search. * - * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a * configuration match is not found, a default configuration will be returned. * - * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -1357,6 +1357,7 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyLRNDescriptor(miopenLRNDescriptor_t lr * * This function takes the input tensor descriptor and outputs a derived tensor for the * normalization scale (gamma) and shift (beta) tensors. + * * For an input tensor NCHW and spatial mode, the output derived tensor is 1C11, while for * per-activation the derived tensor is 1CHW. * @@ -1374,8 +1375,10 @@ MIOPEN_EXPORT miopenStatus_t miopenDeriveBNTensorDescriptor(miopenTensorDescript * Batch normalization pass for forward training pass. * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale * with their descriptor. 
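As a rough orientation for the call documented here, the sketch below runs spatial batch normalization in forward training mode on fp32 NCHW tensors. It is a minimal sketch, assuming the argument order of the MIOpen 1.x header; it passes null pointers for the saved mean and inverse variance, which, as noted just below, is permitted.

```
#include <miopen/miopen.h>

// Spatial batch normalization, forward training, fp32 NCHW. The handle,
// xDesc/x, yDesc/y and the device buffers for scale, bias and the running
// statistics are assumed to be set up by the caller.
void BNFwdTrainSpatial(miopenHandle_t handle,
                       miopenTensorDescriptor_t xDesc, const void* x,
                       miopenTensorDescriptor_t yDesc, void* y,
                       void* bnScale, void* bnBias,
                       void* runningMean, void* runningVariance)
{
    miopenTensorDescriptor_t bnDesc;
    miopenCreateTensorDescriptor(&bnDesc);
    // Derives a 1xCx1x1 descriptor for the scale/bias/mean/variance tensors.
    miopenDeriveBNTensorDescriptor(bnDesc, xDesc, miopenBNSpatial);

    float alpha = 1.0f, beta = 0.0f;
    miopenBatchNormalizationForwardTraining(handle, miopenBNSpatial, &alpha, &beta,
                                            xDesc, x, yDesc, y,
                                            bnDesc, bnScale, bnBias,
                                            0.1,      // exponential averaging factor
                                            runningMean, runningVariance,
                                            1e-5,     // epsilon
                                            nullptr,  // resultSaveMean not kept
                                            nullptr); // resultSaveInvVariance not kept

    miopenDestroyTensorDescriptor(bnDesc);
}
```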
+ * * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean * and inverse variance will not be used. + * * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values * for the running mean and variance will not be saved. * Running averages and variances are scaled using an exponential averaging factor: \f[ @@ -1429,6 +1432,7 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, * Batch normalization pass for forward inference pass. * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale * with their descriptor. + * * If either estimatedMean, or estimatedVariance are null pointers then the values for the mean and * variance will not be used. * @@ -1469,9 +1473,11 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, * * Batch normalization pass for backwards propagation training pass. * The method for backwards propagation batch normalization. + * * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy, * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their * descriptor. + * * If BOTH savedMean, and savedVariance are not null pointers then the method will use the saved * mean and variance calculated by the forward training phase. * @@ -1694,8 +1700,8 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftmaxBackward(miopenHandle_t handle, * RNN mode selection for rnn layer preference */ typedef enum { - miopenRNNRELU = 0, /*!< RNN ReLU squash */ - miopenRNNTANH = 1, /*!< RNN tanh squash */ + miopenRNNRELU = 0, /*!< RNN ReLU activation */ + miopenRNNTANH = 1, /*!< RNN tanh activation */ miopenLSTM = 2, /*!< LSTM */ miopenGRU = 3, /*!< GRU */ } miopenRNNMode_t; @@ -1786,7 +1792,7 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyRNNDescriptor(miopenRNNDescriptor_t rn * @param rnnMode RNN model type (input) * @param biasMode RNN bias included (input) * @param algo RNN algorithm selected (input) - * @param dataType fp32 or fp16 datatype mode, only fp 16 currently supported for RNNs (input) + * @param dataType Only fp32 currently supported for RNNs (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenSetRNNDescriptor(miopenRNNDescriptor_t rnnDesc, @@ -1930,25 +1936,28 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNHiddenTensorSize(miopenHandle_t handle, * For miopenLSTM paramID 0 to 3 refer to the weight matrices associated * with the input GEMM, 4-7 are associated with matrices associated with the * hidden state GEMM. - * ParamID 0 and 4 are for the input gate operations. - * ParamID 1 and 5 are for the forget gate operations. - * ParamID 2 and 6 are for the memory gate operations. - * ParamID 3 and 7 are for the output gate operations. * + * * paramID 0 and 4 are for the input gate operations. + * + * * paramID 1 and 5 are for the forget gate operations. + * + * * paramID 2 and 6 are for the memory gate operations. + * + * * paramID 3 and 7 are for the output gate operations. * - * For miopenGRU paramID 0 to 2 refer to the the weight matrices associated - * with the input GEMM, while 5 through 6 are associated with the hidden state + * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated + * with the input GEMM, while 3 through 5 are associated with the hidden state * GEMM. - * ParamID 0 and 4 are for the reset gate operations. - * ParamID 1 and 5 are for the update gate operations. - * ParamID 2 and 6 are for the memory gate operations. 
+ * + * * paramID 0 and 3 are for the reset gate operations. + * + * * paramID 1 and 4 are for the update gate operations. + * + * * paramID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. * - * When inputSkip mode is selected there is no input layer matrix operation, therefore - * miopenGetRNNLayerParamSize will return zero for matrices associated with the inputs. - * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) * @param layer The layer number in the RNN stack (input) @@ -1970,29 +1979,30 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamSize(miopenHandle_t handle, * weight matrix associated with the in input GEMM, while biasID == 1 retrieves * the bias associated with the hidden state GEMM. * - * For miopenLSTM paramID 0 to 3 refer to the biases associated + * For miopenLSTM biasID 0 to 3 refer to the biases associated * with the input GEMM, 4-7 are associated with biases associated with the * hidden state GEMM. - * biasID 0 and 4 are for the input gate operations. - * biasID 1 and 5 are for the forget gate operations. - * biasID 2 and 6 are for the memory gate operations. - * biasID 3 and 7 are for the output gate operations. * + * * biasID 0 and 4 are for the input gate operations. * - * For miopenGRU biasID 0 to 2 refer to the biases associated - * with the input GEMM, while 5 through 6 are associated with the hidden state - * GEMM. - * biasID 0 and 4 are for the reset gate operations. - * biasID 1 and 5 are for the update gate operations. - * biasID 2 and 6 are for the memory gate operations. + * * biasID 1 and 5 are for the forget gate operations. + * + * * biasID 2 and 6 are for the memory gate operations. * + * * biasID 3 and 7 are for the output gate operations. + * + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * while 3 through 5 are associated with the hidden state GEMM. + * + * * biasID 0 and 3 are for the reset gate operations. + * + * * biasID 1 and 4 are for the update gate operations. + * + * * biasID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. * - * When inputSkip mode is selected there is no input layer matrix operation, therefore - * miopenGetRNNLayerBiasSize will return zero for biases associated with the inputs. - * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) * @param layer The layer number in the RNN stack (input) @@ -2018,18 +2028,24 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasSize(miopenHandle_t handle, * For miopenLSTM paramID 0 to 3 refer to the weight matrices associated * with the input GEMM, 4-7 are associated with matrices associated with the * hidden state GEMM. - * ParamID 0 and 4 are for the input gate operations. - * ParamID 1 and 5 are for the forget gate operations. - * ParamID 2 and 6 are for the memory gate operations. - * ParamID 3 and 7 are for the output gate operations. * + * * paramID 0 and 4 are for the input gate operations. + * + * * paramID 1 and 5 are for the forget gate operations. + * + * * paramID 2 and 6 are for the memory gate operations. + * + * * paramID 3 and 7 are for the output gate operations. 
* - * For miopenGRU paramID 0 to 2 refer to the weight matrices associated - * with the input GEMM, while 5 through 6 are associated with the hidden state + * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated + * with the input GEMM, while 3 through 5 are associated with the hidden state * GEMM. - * ParamID 0 and 4 are for the reset gate operations. - * ParamID 1 and 5 are for the update gate operations. - * ParamID 2 and 6 are for the memory gate operations. + * + * * paramID 0 and 3 are for the reset gate operations. + * + * * paramID 1 and 4 are for the update gate operations. + * + * * paramID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2043,8 +2059,9 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasSize(miopenHandle_t handle, * nullptr then only the paramDesc is populated and returned. The size in bytes of the * layer parameter matrix can be determined by using miopenGetRNNLayerParamSize(). * - * Note: When inputSkip mode is selected there is no input layer matrix operation, therefore - * miopenGetRNNLayerParam will return a error status miopenStatusBadParm. + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerParam() will return + * a error status miopenStatusBadParm for input paramID associated with the input GEMM. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2076,22 +2093,27 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParam(miopenHandle_t handle, * bias associated with the in input GEMM, while biasID == 1 retrieves * the bias associated with the hidden state GEMM. * - * For miopenLSTM paramID 0 to 3 refer to the biases associated + * For miopenLSTM biasID 0 to 3 refer to the biases associated * with the input GEMM, 4-7 are associated with biases associated with the * hidden state GEMM. - * biasID 0 and 4 are for the input gate operations. - * biasID 1 and 5 are for the forget gate operations. - * biasID 2 and 6 are for the memory gate operations. - * biasID 3 and 7 are for the output gate operations. * + * * biasID 0 and 4 are for the input gate operations. + * + * * biasID 1 and 5 are for the forget gate operations. + * + * * biasID 2 and 6 are for the memory gate operations. + * + * * biasID 3 and 7 are for the output gate operations. * - * For miopenGRU biasID 0 to 2 refer to the biases associated - * with the input GEMM, while 5 through 6 are associated with the hidden state - * GEMM. - * biasID 0 and 4 are for the reset gate operations. - * biasID 1 and 5 are for the update gate operations. - * biasID 2 and 6 are for the memory gate operations. * + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * while 3 through 5 are associated with the hidden state GEMM. + * + * * biasID 0 and 3 are for the reset gate operations. + * + * * biasID 1 and 4 are for the update gate operations. + * + * * biasID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2106,8 +2128,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParam(miopenHandle_t handle, * layer bias can be determined by using miopenGetRNNLayerBiasSize(). 
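A minimal retrieval sketch is shown below. It assumes the parameter order documented here, the HIP backend for allocation, and an abbreviated prototype for miopenGetRNNLayerBiasSize() taken from the header rather than from this patch.

```
#include <miopen/miopen.h>
#include <hip/hip_runtime.h>

// Read back the hidden-state bias (biasID == 1 for miopenRNNRELU/miopenRNNTANH)
// of layer 0. rnnDesc, xDesc, wDesc and the packed device weight buffer w are
// assumed to have been created by the caller.
void ReadHiddenBias(miopenHandle_t handle, miopenRNNDescriptor_t rnnDesc,
                    miopenTensorDescriptor_t xDesc,
                    miopenTensorDescriptor_t wDesc, const void* w)
{
    const int layer = 0, biasID = 1;

    size_t biasBytes = 0;
    miopenGetRNNLayerBiasSize(handle, rnnDesc, layer, biasID, &biasBytes);

    miopenTensorDescriptor_t biasDesc;
    miopenCreateTensorDescriptor(&biasDesc);

    void* layerBias = nullptr;
    hipMalloc(&layerBias, biasBytes); // memory to receive the bias values

    miopenGetRNNLayerBias(handle, rnnDesc, layer, xDesc, wDesc, w,
                          biasID, biasDesc, layerBias);

    // ... use biasDesc / layerBias ...
    hipFree(layerBias);
    miopenDestroyTensorDescriptor(biasDesc);
}
```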
* * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenGetRNNLayerBias will return - * a error status miopenStatusBadParm. + * and therefore no associated memory. In this case miopenGetRNNLayerBias() will return + * a error status miopenStatusBadParm for input biasID associated with the input GEMM. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2142,18 +2164,24 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBias(miopenHandle_t handle, * For miopenLSTM paramID 0 to 3 refer to the weight matrix offsets associated * with the input GEMM, 4-7 are associated with matrix offset associated with the * hidden state GEMM. - * ParamID 0 and 4 are for the input gate operations. - * ParamID 1 and 5 are for the forget gate operations. - * ParamID 2 and 6 are for the memory gate operations. - * ParamID 3 and 7 are for the output gate operations. * + * * paramID 0 and 4 are for the input gate operations. + * + * * paramID 1 and 5 are for the forget gate operations. + * + * * paramID 2 and 6 are for the memory gate operations. + * + * * paramID 3 and 7 are for the output gate operations. * * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated - * with the input GEMM, while 5 through 6 are associated with the hidden state + * with the input GEMM, while 3 through 5 are associated with the hidden state * GEMM. - * ParamID 0 and 4 are for the reset gate operations. - * ParamID 1 and 5 are for the update gate operations. - * ParamID 2 and 6 are for the memory gate operations. + * + * * paramID 0 and 3 are for the reset gate operations. + * + * * paramID 1 and 4 are for the update gate operations. + * + * * paramID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2165,6 +2193,10 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBias(miopenHandle_t handle, * The argument layerParamOffset should either be nullptr, or an address to place the * offset. If layerParamOffset is nullptr then only the paramDesc is populated and returned. * + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerParamOffset() will return + * a error status miopenStatusBadParm for input paramID associated with the input GEMM. + * * * @param rnnDesc RNN layer descriptor type (input) * @param layer The layer number in the RNN stack (input) @@ -2189,21 +2221,25 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamOffset(miopenRNNDescriptor_t * bias associated with the in input GEMM, while biasID == 1 retrieves * the weight matrix associated with the hidden state GEMM. * - * For miopenLSTM paramID 0 to 3 refer to the bias offset associated + * For miopenLSTM biasID 0 to 3 refer to the bias offset associated * with the input GEMM, 4-7 are the bias offsets associated with the hidden state GEMM. - * biasID 0 and 4 are for the input gate operations. - * biasID 1 and 5 are for the forget gate operations. - * biasID 2 and 6 are for the memory gate operations. - * biasID 3 and 7 are for the output gate operations. * + * * biasID 0 and 4 are for the input gate operations. * - * For miopenGRU biasID 0 to 2 refer to the bias offsets associated - * with the input GEMM, while 5 through 6 are associated with the hidden state - * GEMM. 
- * biasID 0 and 4 are for the reset gate operations. - * biasID 1 and 5 are for the update gate operations. - * biasID 2 and 6 are for the memory gate operations. + * * biasID 1 and 5 are for the forget gate operations. + * + * * biasID 2 and 6 are for the memory gate operations. + * + * * biasID 3 and 7 are for the output gate operations. * + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * while 3 through 5 are associated with the hidden state GEMM. + * + * * biasID 0 and 3 are for the reset gate operations. + * + * * biasID 1 and 4 are for the update gate operations. + * + * * biasID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2215,6 +2251,9 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamOffset(miopenRNNDescriptor_t * The argument layerBiasOffset should either be nullptr, or point to an output address. * If layerBias is nullptr then only the biasDesc is populated and returned. * + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerBiasOffset() will return + * a error status miopenStatusBadParm for input biasID associated with the input GEMM. * * @param rnnDesc RNN layer descriptor type (input) * @param layer The layer number in the RNN stack (input) @@ -2243,18 +2282,25 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasOffset(miopenRNNDescriptor_t r * For miopenLSTM paramID 0 to 3 refer to the weight matrices associated * with the input GEMM, 4-7 are associated with matrices associated with the * hidden state GEMM. - * ParamID 0 and 4 are for the input gate operations. - * ParamID 1 and 5 are for the forget gate operations. - * ParamID 2 and 6 are for the memory gate operations. - * ParamID 3 and 7 are for the output gate operations. * + * * paramID 0 and 4 are for the input gate operations. + * + * * paramID 1 and 5 are for the forget gate operations. + * + * * paramID 2 and 6 are for the memory gate operations. + * + * * paramID 3 and 7 are for the output gate operations. * - * For miopenGRU paramID 0 to 2 refer to the weight matrices associated - * with the input GEMM, while 5 through 6 are associated with the hidden state + * + * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated + * with the input GEMM, while 3 through 5 are associated with the hidden state * GEMM. - * ParamID 0 and 4 are for the reset gate operations. - * ParamID 1 and 5 are for the update gate operations. - * ParamID 2 and 6 are for the memory gate operations. + * + * * paramID 0 and 3 are for the reset gate operations. + * + * * paramID 1 and 4 are for the update gate operations. + * + * * paramID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2262,6 +2308,9 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasOffset(miopenRNNDescriptor_t r * The input argument paramDesc is a previously populated tensor descriptor typically * by first calling miopenGetRNNLayerParam(). * + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenSetRNNLayerParam() will return + * a error status miopenStatusBadParm for input paramID associated with the input GEMM. 
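A possible calling sequence, sketched under the assumption that the parameter lists documented here reflect the final argument order, is to populate paramDesc with miopenGetRNNLayerParam() and then write the new values with miopenSetRNNLayerParam().

```
#include <miopen/miopen.h>

// Overwrite the input-GEMM weight matrix (paramID == 0) of layer 0 with new
// values held in newWeights (device memory). rnnDesc, xDesc, wDesc and the
// packed weight buffer w are assumed to exist; the RNN must not use the
// miopenRNNskip input mode, per the note above.
void WriteInputWeights(miopenHandle_t handle, miopenRNNDescriptor_t rnnDesc,
                       miopenTensorDescriptor_t xDesc,
                       miopenTensorDescriptor_t wDesc, void* w,
                       const void* newWeights)
{
    const int layer = 0, paramID = 0;

    miopenTensorDescriptor_t paramDesc;
    miopenCreateTensorDescriptor(&paramDesc);

    // Passing nullptr for layerParam only populates paramDesc with the matrix shape.
    miopenGetRNNLayerParam(handle, rnnDesc, layer, xDesc, wDesc, w,
                           paramID, paramDesc, nullptr);

    // newWeights must match the size reported by miopenGetRNNLayerParamSize().
    miopenSetRNNLayerParam(handle, rnnDesc, layer, xDesc, wDesc, w,
                           paramID, paramDesc, newWeights);

    miopenDestroyTensorDescriptor(paramDesc);
}
```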
* * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2292,22 +2341,27 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerParam(miopenHandle_t handle, * weight matrix associated with the in input GEMM, while biasID == 1 retrieves * the bias associated with the hidden state GEMM. * - * For miopenLSTM paramID 0 to 3 refer to the biases associated + * For miopenLSTM biasID 0 to 3 refer to the biases associated * with the input GEMM, 4-7 are associated with the biases associated with the * hidden state GEMM. - * biasID 0 and 4 are for the input gate operations. - * biasID 1 and 5 are for the forget gate operations. - * biasID 2 and 6 are for the memory gate operations. - * biasID 3 and 7 are for the output gate operations. * + * * biasID 0 and 4 are for the input gate operations. * - * For miopenGRU biasID 0 to 2 refer to the biases associated - * with the input GEMM, while 5 through 6 are associated with the hidden state - * GEMM. - * biasID 0 and 4 are for the reset gate operations. - * biasID 1 and 5 are for the update gate operations. - * biasID 2 and 6 are for the memory gate operations. + * * biasID 1 and 5 are for the forget gate operations. * + * * biasID 2 and 6 are for the memory gate operations. + * + * * biasID 3 and 7 are for the output gate operations. + * + * + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * while 3 through 5 are associated with the hidden state GEMM. + * + * * biasID 0 and 3 are for the reset gate operations. + * + * * biasID 1 and 4 are for the update gate operations. + * + * * biasID 2 and 5 are for the memory gate operations. * * For bi-directional RNNs the backwards in time direction is numbered as the layer * directly after the forward in time direction. @@ -2315,6 +2369,9 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerParam(miopenHandle_t handle, * The input argument biasDesc is a previously populated tensor descriptor typically * by first calling miopenGetRNNLayeBias(). * + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenSetRNNLayerBias will return + * a error status miopenStatusBadParm for input biasID associated with the input GEMM. * * @param handle MIOpen handle (input) * @param rnnDesc RNN layer descriptor type (input) @@ -2355,13 +2412,15 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerBias(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor (input) + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * then the initial hidden state will be zero initialized. (input) * @param cxDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the cell layer input tensor (input) + * @param cx Pointer to the cell layer input tensor. If cx is NULL, + * then the initial cell state will be zero initialized. 
(input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) * @param yDesc An array of fully packed tensor descriptors associated @@ -2377,13 +2436,15 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerBias(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hy Pointer to the hidden layer output tensor (output) + * @param hy Pointer to the hidden layer output tensor. If hy is NULL, + * then the final hidden state will not be saved. (output) * @param cyDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cy Pointer to the cell layer output tensor (output) + * @param cy Pointer to the cell layer output tensor. If hy is NULL, + * then the final cell state will not be saved. (output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) * @param reserveSpace Pointer to memory allocated for random states (input / output) @@ -2440,7 +2501,8 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNForwardTraining(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param dcy Pointer to the cell layer input tensor (input) + * @param dcy Pointer to the cell layer input tensor. If dcy is NULL, + * then the initial delta cell state will be zero initialized. (input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) * @param hxDesc An input hidden tensor descriptor that has as its first dimension @@ -2448,13 +2510,15 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNForwardTraining(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to output tensor (input) + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * then the initial hidden state will be zero initialized. (input) * @param cxDesc A input cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the hidden layer output tensor (input) + * @param cx Pointer to the hidden layer input tensor. If cx is NULL, + * then the initial cell state will be zero initialized. (input) * @param dxDesc An array of tensor descriptors. These are the * input descriptors to each time step. 
The first dimension of each descriptor is the * batch size and may decrease from element n to element n+1 and not increase in size. @@ -2466,13 +2530,15 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNForwardTraining(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param dhx Pointer to the cell layer output tensor (output) + * @param dhx Pointer to the delta hidden layer output tensor. If dhx is NULL + * the hidden gradient will not ouput. (output) * @param dcxDesc A tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param dcx Pointer to the cell layer output tensor (output) + * @param dcx Pointer to the cell layer output tensor. If dcx is NULL + * the cell gradient will not ouput. (output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) * @param reserveSpace Pointer to memory allocated for random states (input / output) @@ -2525,7 +2591,8 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardData(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor (input) + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * then the initial hidden state will be zero initialized. (input) * @param yDesc An array of fully packed tensor descriptors associated * with the output from each time step. The first dimension of the tensor descriptors * must equal the first dimension of the first descriptor (batch size) in the xDesc @@ -2533,9 +2600,9 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardData(miopenHandle_t handle, * depends on the direction mode selected. If the direction mode is unidirectional, * the second dimension is the hiddenSize. If direction mode is bidirectional * the second dimension is twice the hiddenSize. (input) - * @param y Pointer to the cell layer input tensor (input) + * @param y Pointer to the output tensor (input) * @param dwDesc A weights tensor descriptor (input) - * @param dw Pointer to input weights tensor (output) + * @param dw Pointer to input weights tensor (input / output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) * @param reserveSpace Pointer to memory allocated for random states (input) @@ -2576,13 +2643,15 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardWeights(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor (input) + * @param hx Pointer to the hidden layer input tensor. 
If hx is NULL, + * then the initial hidden state will be zero initialized. (input) * @param cxDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the cell layer input tensor (input) + * @param cx Pointer to the cell layer input tensor. If cx is NULL, + * then the initial cell state will be zero initialized. (input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) * @param yDesc An array of fully packed tensor descriptors associated @@ -2598,13 +2667,15 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardWeights(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hy Pointer to the hidden layer output tensor (output) + * @param hy Pointer to the hidden layer output tensor. If hy is NULL, + * then the final hidden state will not be saved. (output) * @param cyDesc A output cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cy Pointer to the cell layer output tensor (output) + * @param cy Pointer to the cell layer output tensor. If cy is NULL, + * then the final cell state will not be saved. (output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) * @return miopenStatus_t From 91febb1c4844fc189ef2b931cbd606b61a60ccb5 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Sun, 25 Mar 2018 20:27:40 -0500 Subject: [PATCH 03/12] Formatting --- include/miopen/miopen.h | 109 +++++++++++++++++++++------------------- test/rnn_vanilla.cpp | 2 +- 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index bfe9054236..1954adf61c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -613,7 +613,8 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyConvolutionDescriptor(miopenConvolutionDescriptor_t convDesc); /*! @enum miopenConvFwdAlgorithm_t - * Convolutional algorithm mode for forward propagation. MIOpen use cross-correlation for its convolution implementation. + * Convolutional algorithm mode for forward propagation. MIOpen use cross-correlation for its + * convolution implementation. */ typedef enum { miopenConvolutionFwdAlgoGEMM = 0, /*!< GEMM variant */ @@ -698,10 +699,12 @@ miopenConvolutionForwardGetWorkSpaceSize(miopenHandle_t handle, * to execute this function, miopenConvolutionForwardGetWorkSpaceSize() must be * run to determine the required memory for this search. * - * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. 
If + * a * configuration match is not found, a default configuration will be returned. * - * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. + * If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -831,10 +834,12 @@ miopenConvolutionBackwardDataGetWorkSpaceSize(miopenHandle_t handle, * execute this function, miopenConvolutionBackwardsDataGetWorkSpaceSize() must be run to determine * the required memory for this search. * - * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If + * a * configuration match is not found, a default configuration will be returned. * - * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. + * If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -944,10 +949,12 @@ miopenConvolutionBackwardWeightsGetWorkSpaceSize(miopenHandle_t handle, * execute this function, miopenConvolutionBackwardsWeightsGetWorkSpaceSize() must be run to * determine the required memory for this search. * - * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If a + * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If + * a * configuration match is not found, a default configuration will be returned. * - * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. If + * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. + * If * a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) @@ -1938,11 +1945,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNHiddenTensorSize(miopenHandle_t handle, * hidden state GEMM. * * * paramID 0 and 4 are for the input gate operations. - * + * * * paramID 1 and 5 are for the forget gate operations. - * + * * * paramID 2 and 6 are for the memory gate operations. - * + * * * paramID 3 and 7 are for the output gate operations. * * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated @@ -1991,11 +1998,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamSize(miopenHandle_t handle, * * * biasID 3 and 7 are for the output gate operations. * - * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, * while 3 through 5 are associated with the hidden state GEMM. * * * biasID 0 and 3 are for the reset gate operations. - * + * * * biasID 1 and 4 are for the update gate operations. * * * biasID 2 and 5 are for the memory gate operations. @@ -2030,11 +2037,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasSize(miopenHandle_t handle, * hidden state GEMM. * * * paramID 0 and 4 are for the input gate operations. - * + * * * paramID 1 and 5 are for the forget gate operations. - * + * * * paramID 2 and 6 are for the memory gate operations. 
- * + * * * paramID 3 and 7 are for the output gate operations. * * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated @@ -2059,8 +2066,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasSize(miopenHandle_t handle, * nullptr then only the paramDesc is populated and returned. The size in bytes of the * layer parameter matrix can be determined by using miopenGetRNNLayerParamSize(). * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenGetRNNLayerParam() will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerParam() will return * a error status miopenStatusBadParm for input paramID associated with the input GEMM. * * @param handle MIOpen handle (input) @@ -2106,11 +2113,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParam(miopenHandle_t handle, * * biasID 3 and 7 are for the output gate operations. * * - * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, * while 3 through 5 are associated with the hidden state GEMM. * * * biasID 0 and 3 are for the reset gate operations. - * + * * * biasID 1 and 4 are for the update gate operations. * * * biasID 2 and 5 are for the memory gate operations. @@ -2127,8 +2134,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParam(miopenHandle_t handle, * nullptr then only the biasDesc is populated and returned. The size in bytes of the * layer bias can be determined by using miopenGetRNNLayerBiasSize(). * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenGetRNNLayerBias() will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerBias() will return * a error status miopenStatusBadParm for input biasID associated with the input GEMM. * * @param handle MIOpen handle (input) @@ -2166,11 +2173,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBias(miopenHandle_t handle, * hidden state GEMM. * * * paramID 0 and 4 are for the input gate operations. - * + * * * paramID 1 and 5 are for the forget gate operations. - * + * * * paramID 2 and 6 are for the memory gate operations. - * + * * * paramID 3 and 7 are for the output gate operations. * * For miopenGRU paramID 0 to 2 refer to the weight matrix offset associated @@ -2193,8 +2200,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBias(miopenHandle_t handle, * The argument layerParamOffset should either be nullptr, or an address to place the * offset. If layerParamOffset is nullptr then only the paramDesc is populated and returned. * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenGetRNNLayerParamOffset() will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerParamOffset() will return * a error status miopenStatusBadParm for input paramID associated with the input GEMM. * * @@ -2232,11 +2239,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamOffset(miopenRNNDescriptor_t * * * biasID 3 and 7 are for the output gate operations. 
* - * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, * while 3 through 5 are associated with the hidden state GEMM. * * * biasID 0 and 3 are for the reset gate operations. - * + * * * biasID 1 and 4 are for the update gate operations. * * * biasID 2 and 5 are for the memory gate operations. @@ -2251,8 +2258,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerParamOffset(miopenRNNDescriptor_t * The argument layerBiasOffset should either be nullptr, or point to an output address. * If layerBias is nullptr then only the biasDesc is populated and returned. * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenGetRNNLayerBiasOffset() will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenGetRNNLayerBiasOffset() will return * a error status miopenStatusBadParm for input biasID associated with the input GEMM. * * @param rnnDesc RNN layer descriptor type (input) @@ -2284,11 +2291,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasOffset(miopenRNNDescriptor_t r * hidden state GEMM. * * * paramID 0 and 4 are for the input gate operations. - * + * * * paramID 1 and 5 are for the forget gate operations. - * + * * * paramID 2 and 6 are for the memory gate operations. - * + * * * paramID 3 and 7 are for the output gate operations. * * @@ -2308,8 +2315,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetRNNLayerBiasOffset(miopenRNNDescriptor_t r * The input argument paramDesc is a previously populated tensor descriptor typically * by first calling miopenGetRNNLayerParam(). * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenSetRNNLayerParam() will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenSetRNNLayerParam() will return * a error status miopenStatusBadParm for input paramID associated with the input GEMM. * * @param handle MIOpen handle (input) @@ -2354,11 +2361,11 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerParam(miopenHandle_t handle, * * biasID 3 and 7 are for the output gate operations. * * - * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, + * For miopenGRU biasID 0 to 2 refer to the biases associated with the input GEMM, * while 3 through 5 are associated with the hidden state GEMM. * * * biasID 0 and 3 are for the reset gate operations. - * + * * * biasID 1 and 4 are for the update gate operations. * * * biasID 2 and 5 are for the memory gate operations. @@ -2369,8 +2376,8 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerParam(miopenHandle_t handle, * The input argument biasDesc is a previously populated tensor descriptor typically * by first calling miopenGetRNNLayeBias(). * - * Note: When inputSkip mode is selected there is no input layer matrix operation, - * and therefore no associated memory. In this case miopenSetRNNLayerBias will return + * Note: When inputSkip mode is selected there is no input layer matrix operation, + * and therefore no associated memory. In this case miopenSetRNNLayerBias will return * a error status miopenStatusBadParm for input biasID associated with the input GEMM. 
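As an illustration only, the offset API added in this release can be used to address a bias directly inside the packed weight buffer instead of copying through miopenSetRNNLayerBias(). The sketch assumes the parameter order documented for miopenGetRNNLayerBiasOffset(), an fp32 weight buffer, the HIP backend, and that the returned offset is an element offset; that last point should be verified against the header.

```
#include <miopen/miopen.h>
#include <hip/hip_runtime.h>

// Update the hidden-state bias (biasID == 1) of layer 0 in place through the
// offset API. rnnDesc, xDesc and the packed fp32 weight buffer w are assumed
// to exist; hostBias holds the new values.
void UpdateHiddenBiasViaOffset(miopenHandle_t handle, miopenRNNDescriptor_t rnnDesc,
                               miopenTensorDescriptor_t xDesc, void* w,
                               const float* hostBias)
{
    const int layer = 0, biasID = 1;

    miopenTensorDescriptor_t biasDesc;
    miopenCreateTensorDescriptor(&biasDesc);

    size_t biasOffset = 0;
    miopenGetRNNLayerBiasOffset(rnnDesc, layer, xDesc, biasID, biasDesc, &biasOffset);

    size_t biasBytes = 0;
    miopenGetRNNLayerBiasSize(handle, rnnDesc, layer, biasID, &biasBytes);

    // Assumption: the offset is counted in elements of the fp32 weight buffer;
    // check miopen.h if a byte offset is expected instead.
    hipMemcpy(static_cast<float*>(w) + biasOffset, hostBias, biasBytes,
              hipMemcpyHostToDevice);

    miopenDestroyTensorDescriptor(biasDesc);
}
```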
* * @param handle MIOpen handle (input) @@ -2412,14 +2419,14 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerBias(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, * then the initial hidden state will be zero initialized. (input) * @param cxDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the cell layer input tensor. If cx is NULL, + * @param cx Pointer to the cell layer input tensor. If cx is NULL, * then the initial cell state will be zero initialized. (input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) @@ -2436,14 +2443,14 @@ MIOPEN_EXPORT miopenStatus_t miopenSetRNNLayerBias(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hy Pointer to the hidden layer output tensor. If hy is NULL, + * @param hy Pointer to the hidden layer output tensor. If hy is NULL, * then the final hidden state will not be saved. (output) * @param cyDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cy Pointer to the cell layer output tensor. If hy is NULL, + * @param cy Pointer to the cell layer output tensor. If hy is NULL, * then the final cell state will not be saved. (output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) @@ -2501,7 +2508,7 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNForwardTraining(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param dcy Pointer to the cell layer input tensor. If dcy is NULL, + * @param dcy Pointer to the cell layer input tensor. If dcy is NULL, * then the initial delta cell state will be zero initialized. (input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) @@ -2510,14 +2517,14 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNForwardTraining(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. 
(input) - * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, * then the initial hidden state will be zero initialized. (input) * @param cxDesc A input cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the hidden layer input tensor. If cx is NULL, + * @param cx Pointer to the hidden layer input tensor. If cx is NULL, * then the initial cell state will be zero initialized. (input) * @param dxDesc An array of tensor descriptors. These are the * input descriptors to each time step. The first dimension of each descriptor is the @@ -2591,7 +2598,7 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardData(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, * then the initial hidden state will be zero initialized. (input) * @param yDesc An array of fully packed tensor descriptors associated * with the output from each time step. The first dimension of the tensor descriptors @@ -2643,14 +2650,14 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardWeights(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hx Pointer to the hidden layer input tensor. If hx is NULL, + * @param hx Pointer to the hidden layer input tensor. If hx is NULL, * then the initial hidden state will be zero initialized. (input) * @param cxDesc A cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cx Pointer to the cell layer input tensor. If cx is NULL, + * @param cx Pointer to the cell layer input tensor. If cx is NULL, * then the initial cell state will be zero initialized. (input) * @param wDesc A weights tensor descriptor (input) * @param w Pointer to input weights tensor (input) @@ -2667,14 +2674,14 @@ MIOPEN_EXPORT miopenStatus_t miopenRNNBackwardWeights(miopenHandle_t handle, * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param hy Pointer to the hidden layer output tensor. If hy is NULL, + * @param hy Pointer to the hidden layer output tensor. If hy is NULL, * then the final hidden state will not be saved. 
(output) * @param cyDesc A output cell tensor descriptor that has as its first dimension * of the number of layers if the direction mode is unidirectional and twice the * number of layers if the direction mode is bidirectional. The second dimension of * the descriptor must equal the largest first dimension of the xDesc tensor descriptor * array. The third dimension equals the hiddenSize. (input) - * @param cy Pointer to the cell layer output tensor. If cy is NULL, + * @param cy Pointer to the cell layer output tensor. If cy is NULL, * then the final cell state will not be saved. (output) * @param workSpace Pointer to memory allocated for forward training (input) * @param workSpaceNumBytes Number of allocated bytes in memory for the workspace (input) diff --git a/test/rnn_vanilla.cpp b/test/rnn_vanilla.cpp index 9838652d87..db9348cc8c 100644 --- a/test/rnn_vanilla.cpp +++ b/test/rnn_vanilla.cpp @@ -45,7 +45,7 @@ #include #include -#define MIO_RNN_TEST_DEBUG 0 +#define MIO_RNN_TEST_DEBUG 1 #define MIO_RNN_TIME_EVERYTHING 0 /********************************************** From 044918c2262af0fd13fa92eb41e1614a4c593538 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Sun, 25 Mar 2018 20:40:30 -0500 Subject: [PATCH 04/12] Removed debug flag. --- test/rnn_vanilla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/rnn_vanilla.cpp b/test/rnn_vanilla.cpp index db9348cc8c..9838652d87 100644 --- a/test/rnn_vanilla.cpp +++ b/test/rnn_vanilla.cpp @@ -45,7 +45,7 @@ #include #include -#define MIO_RNN_TEST_DEBUG 1 +#define MIO_RNN_TEST_DEBUG 0 #define MIO_RNN_TIME_EVERYTHING 0 /********************************************** From eec8ea11ec64cd34b70f3ed86d1a8dcf70848a39 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Sun, 25 Mar 2018 20:43:42 -0500 Subject: [PATCH 05/12] More formatting. --- src/ocl/rnnocl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 6a42a2fcec..c1edb6f444 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -3682,7 +3682,7 @@ void RNNDescriptor::RNNBackwardWeights(Handle& handle, { int in_bias_val = inputMode == miopenRNNskip ? 0 : wei_stride; - hid_shift = li * batch_n * hy_stride; + hid_shift = li * batch_n * hy_stride; wei_shift = (li == 0) ? (wei_shift_bias + in_bias_val) : (wei_shift_bias + in_bias_val + li * 2 * wei_stride); From c0c32e82bddfbe4ea04a7f4eeed7629438db8823 Mon Sep 17 00:00:00 2001 From: mayank daga Date: Fri, 30 Mar 2018 13:19:12 -0500 Subject: [PATCH 06/12] multi-line formatting --- include/miopen/miopen.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 1954adf61c..01aff0be79 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -700,12 +700,10 @@ miopenConvolutionForwardGetWorkSpaceSize(miopenHandle_t handle, * run to determine the required memory for this search. * * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If - * a - * configuration match is not found, a default configuration will be returned. + * a configuration match is not found, a default configuration will be returned. * * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. - * If - * a match is not found, an exhaustive search is performed by running individual algorithms. + * If a match is not found, an exhaustive search is performed by running individual algorithms. 
* * @param handle MIOpen handle (input) * @param xDesc Tensor descriptor for data input tensor x (input) @@ -835,12 +833,10 @@ miopenConvolutionBackwardDataGetWorkSpaceSize(miopenHandle_t handle, * the required memory for this search. * * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If - * a - * configuration match is not found, a default configuration will be returned. + * a configuration match is not found, a default configuration will be returned. * * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. - * If - * a match is not found, an exhaustive search is performed by running individual algorithms. + * If a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) * @param dyDesc Tensor descriptor for data input tensor dy (input) @@ -950,12 +946,10 @@ miopenConvolutionBackwardWeightsGetWorkSpaceSize(miopenHandle_t handle, * determine the required memory for this search. * * * If exhaustiveSearch == 0, MIOpen will look for the first kernel with a configuration match. If - * a - * configuration match is not found, a default configuration will be returned. + * a configuration match is not found, a default configuration will be returned. * * * If exhaustiveSearch == 1, MIOpen will look for the best kernel for the provided configuration. - * If - * a match is not found, an exhaustive search is performed by running individual algorithms. + * If a match is not found, an exhaustive search is performed by running individual algorithms. * * @param handle MIOpen handle (input) * @param dyDesc Tensor descriptor for data input tensor dy (input) From 4b44009f10b206515e5e51fbe9346f11f44814ec Mon Sep 17 00:00:00 2001 From: mayank daga Date: Fri, 30 Mar 2018 13:21:33 -0500 Subject: [PATCH 07/12] fixed numeric --- doc/src/perfdatabase.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/perfdatabase.md b/doc/src/perfdatabase.md index 11b0ce694c..20c34a7339 100644 --- a/doc/src/perfdatabase.md +++ b/doc/src/perfdatabase.md @@ -48,4 +48,4 @@ This variable allows to limit the scope of `MIOPEN_FIND_ENFORCE`, so that only f **CONV_BWD (3)** `MIOPEN_FIND_ENFORCE` affects only Backward Data convolutions. -**CONV_WRW (3)** `MIOPEN_FIND_ENFORCE` affects only Backward With Regard to Weights (a.k.a WRW) convolutions. \ No newline at end of file +**CONV_WRW (4)** `MIOPEN_FIND_ENFORCE` affects only Backward With Regard to Weights (a.k.a WRW) convolutions. From 768ba03b6e1f270b5b58f515496bd990eedf7b49 Mon Sep 17 00:00:00 2001 From: mayank daga Date: Fri, 30 Mar 2018 13:28:44 -0500 Subject: [PATCH 08/12] edits --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b6ff4241d5..d0b58d5a14 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ This will install by default to `/usr/local` but it can be installed in another cmake -P install_deps.cmake --prefix /some/local/dir ``` -If Ubuntu v16 is used then the `OpenSSL` and `Boost` packages can also be installed by: +Instructions to manually install all the dependencies on Ubuntu v16 are present in this [section](#installing-the-dependencies-manually). 
``` sudo apt-get install libssl-dev sudo apt-get install libboost-dev @@ -187,6 +187,18 @@ Also, githooks can be installed to format the code per-commit: ./.githooks/install ``` +## Installing the dependencies manually + +If Ubuntu v16 is used then the `OpenSSL` and `Boost` packages can also be installed by: +``` +sudo apt-get install libssl-dev +sudo apt-get install libboost-dev +sudo apt-get install libboost-system-dev +sudo apt-get install libboost-filesystem-dev +``` + +`half` header needs to be installed from [here](http://half.sourceforge.net/). + ## Using docker The easiest way is to use docker. You can build the top-level docker file: From 5d59939c85caf74ed645b9f4b7f1241979df33ba Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Fri, 30 Mar 2018 13:35:07 -0500 Subject: [PATCH 09/12] Update README.md --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index d0b58d5a14..d124bc83ae 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,6 @@ cmake -P install_deps.cmake --prefix /some/local/dir ``` Instructions to manually install all the dependencies on Ubuntu v16 are present in this [section](#installing-the-dependencies-manually). -``` -sudo apt-get install libssl-dev -sudo apt-get install libboost-dev -sudo apt-get install libboost-system-dev -sudo apt-get install libboost-filesystem-dev -``` ## Installing MIOpen with pre-built packages From 98c159596b93cbbfde7b9e614ab0176cc20e491b Mon Sep 17 00:00:00 2001 From: mayank daga Date: Fri, 30 Mar 2018 13:39:08 -0500 Subject: [PATCH 10/12] fixed var. redefine --- src/kernels/MIOpenLRNBwd.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/MIOpenLRNBwd.cl b/src/kernels/MIOpenLRNBwd.cl index b5c235b25e..9bdb6f7529 100644 --- a/src/kernels/MIOpenLRNBwd.cl +++ b/src/kernels/MIOpenLRNBwd.cl @@ -40,7 +40,7 @@ #endif #define _FLOAT2 PPCAT(_FLOAT, TWO) -#define _FLOAT2 PPCAT(_FLOAT, THREE) +#define _FLOAT3 PPCAT(_FLOAT, THREE) #define _FLOAT4 PPCAT(_FLOAT, FOUR) #define _FLOAT8 PPCAT(_FLOAT, EIGHT) From 2bbec29955877584fd4ccfcc9e56f66a45686df4 Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Fri, 30 Mar 2018 13:47:45 -0500 Subject: [PATCH 11/12] Update releasenotes.md --- doc/src/releasenotes.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md index 9c0d6bf705..e8a93e8240 100644 --- a/doc/src/releasenotes.md +++ b/doc/src/releasenotes.md @@ -5,17 +5,17 @@ Notes: -- This release adds fp16 support for Inference using CNNs - Performance improvements for RNNs - Performance improvements for convolutions using 1x1 filters - Performance improvement for Batch Normalization +- This release adds preliminary fp16 support for Inference using CNNs - Bug fixes for various components of MIOpen -Changed: +Changes: - Added 2 new API for RNNs: miopenGetRNNLayerParamOffset and miopenGetRNNLayerBiasOffset -- Added in support for uninitialized hidden states and nullptr outputs in RNNs -- Added support for Set and Scale operations for strided tensors with dimensions 1,2,3,4,5 +- Added support for uninitialized hidden states and nullptr outputs in RNNs +- Added support for Set and Scale operations for strided tensors with dimensions 1 to 5 - Added multi-thread and multi-process support for the performance database - Improved performance for OpTensor - Fixed bug in convolutions for backward bias From 61ca3774ed5d4d54ca303a6c0c64a4f320ed5a83 Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Fri, 30 Mar 2018 13:49:18 -0500 Subject: 
[PATCH 12/12] Update README.md --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index d124bc83ae..5c2706f597 100644 --- a/README.md +++ b/README.md @@ -20,14 +20,10 @@ AMD's library for high peformance machine learning primitives. MIOpen supports t ## Installing the dependencies -The dependencies can be installed with the `install_deps.cmake`, script: +The dependencies can be installed with the `install_deps.cmake` script: `cmake -P install_deps.cmake` -``` -cmake -P install_deps.cmake -``` This will install by default to `/usr/local` but it can be installed in another location with `--prefix` argument: - ``` cmake -P install_deps.cmake --prefix /some/local/dir ```
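
A brief usage sketch to accompany the README hunk above: when the dependencies are installed into a custom prefix with `install_deps.cmake`, the MIOpen build has to be told where that prefix is. The `--prefix` path reuses `/some/local/dir` from the diff; the `MIOPEN_BACKEND` option and the use of `CMAKE_PREFIX_PATH` are assumptions based on standard CMake practice and the rest of the README, not text shown in this patch.

```
# Install the dependencies into a custom prefix (command taken from the README above)
cmake -P install_deps.cmake --prefix /some/local/dir

# Hypothetical follow-up: point the MIOpen configure step at that prefix.
# MIOPEN_BACKEND and CMAKE_PREFIX_PATH are assumptions, not part of this patch.
mkdir build && cd build
cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_PREFIX_PATH=/some/local/dir ..
make -j"$(nproc)"
```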