diff --git a/README.md b/README.md index 9aecd57e67..f4b3e99fce 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ MIOpen supports two programming models - * ROCm cmake modules can be installed from [here](https://github.com/RadeonOpenCompute/rocm-cmake) * [Half](http://half.sourceforge.net/) - IEEE 754-based half-precision floating point library * [Boost](http://www.boost.org/) at least version 1.58 - * MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/doc/src/cache.md) -* [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS) Minimum version 2.0.0 (recommended version 2.2.0) + * MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html) +* [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS) Minimum version branch [master-rocm-2.6](https://github.com/ROCmSoftwarePlatform/rocBLAS/tree/master-rocm-2.6) ## Installing MIOpen with pre-built packages @@ -44,12 +44,12 @@ cmake -P install_deps.cmake --prefix /some/local/dir ``` This prefix can used to specify the dependency path during the configuration phase using the `CMAKE_PREFIX_PATH`. -MIOpen's HIP backend uses [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS) by default. Users can intall rocBlas minimum release by using `apt-get install rocblas`. To disable using rocBlas set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off`. rocBlas is *not* available for the OpenCL backend. +MIOpen's HIP backend uses [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS) by default. Users can install rocBlas minimum release by using `apt-get install rocblas`. To disable using rocBlas set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off`. rocBlas is *not* available for the OpenCL backend. ## Installing minimum dependencies in ROCm environment -Users who are working in a fully installed and up to date ROCm environment may not wish to additionally install rocm-cmake, clang-ocl, MIOpenGEMM, or rocBLAS. This can be done by simpily inserting the command `--minimum` into the cmake command as shown below: +Users who are working in a fully installed and up to date ROCm environment may not wish to additionally install rocm-cmake, clang-ocl, MIOpenGEMM, or rocBLAS. This can be done by simply inserting the command `--minimum` into the cmake command as shown below: ``` cmake -P install_deps.cmake --minimum --prefix /some/local/dir @@ -91,6 +91,7 @@ cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_PREFIX_PATH=/some/local/dir .. Set the C++ compiler to `hcc`. ``` +export CXX= cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. ``` An example cmake step can be: @@ -118,7 +119,7 @@ Database paths can be explicitly customized by means of `MIOPEN_SYSTEM_DB_PATH` If the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user database file. The user can find the file with the suffix `*.updb.txt` in the user perf db path. -More information about the performance database can be found [here](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/doc/src/perfdatabase.md). +More information about the performance database can be found [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/perfdatabase.html). ### Persistent Program Cache @@ -127,7 +128,7 @@ MIOpen by default caches the device programs in the location `~/.cache/miopen/`. 
Users can also disable the cache during runtime using the environmental variable set as `MIOPEN_DISABLE_CACHE=1`. -If the compiler changes, or the user modifies the kernels then the cache must be deleted for the MIOpen version in use; e.g., `rm -rf ~/.cache/miopen/`. More information about the cache can be found [here](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/doc/src/cache.md). +If the compiler changes, or the user modifies the kernels then the cache must be deleted for the MIOpen version in use; e.g., `rm -rf ~/.cache/miopen/`. More information about the cache can be found [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html). ### Changing the cmake configuration @@ -158,7 +159,7 @@ The driver can be built using the `MIOpenDriver` target: ` cmake --build . --config Release --target MIOpenDriver ` **OR** ` make MIOpenDriver ` -Documentation on how to run the driver is [here](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/driver/README.md). +Documentation on how to run the driver is [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/driver.html). ## Running the tests @@ -187,7 +188,10 @@ HTML and PDFs are generated using [Sphinx](http://www.sphinx-doc.org/en/stable/i Requirements for both Sphinx, Breathe, and the ReadTheDocs theme can be filled for these in the MIOpen/doc folder: -`pip install -r ./requirements.txt` +``` +pip install -r ./requirements.txt +``` + Depending on your setup `sudo` may be required for the pip install. @@ -214,7 +218,7 @@ sudo apt-get install libboost-system-dev sudo apt-get install libboost-filesystem-dev ``` -*Note:* MIOpen by default will attempt to build with Boost staticially linked libraries. If it is needed, the user can build with dynamically linked Boost libraries by using this flag during the configruation stage: +*Note:* MIOpen by default will attempt to build with Boost statically linked libraries. If it is needed, the user can build with dynamically linked Boost libraries by using this flag during the configruation stage: ``` -DBoost_USE_STATIC_LIBS=Off ``` @@ -226,9 +230,13 @@ The `half` header needs to be installed from [here](http://half.sourceforge.net/ ## Using docker The easiest way is to use docker. You can build the top-level docker file: +``` +docker build -t miopen . +``` - docker build -t miopen . +Then to enter the development environment use `docker run`: +``` +docker run --device='/dev/kfd' --device='/dev/dri' -v=`pwd`:/data -w /data --group-add video -it miopen +``` -Then to enter the developement environment use `docker run`: - docker run --device='/dev/kfd' --device='/dev/dri' -v=`pwd`:/data -w /data --group-add video -it miopen diff --git a/doc/src/DebugAndLogging.md b/doc/src/DebugAndLogging.md index 9e119241ad..79e30fb48e 100644 --- a/doc/src/DebugAndLogging.md +++ b/doc/src/DebugAndLogging.md @@ -28,7 +28,8 @@ All logging messages output to standard error stream (`stderr`). The following e > **_NOTE:_ When asking for technical support, please include the console log obtained with the following settings:** > ``` > export MIOPEN_ENABLE_LOGGING=1 -> export MIOPEN_LOG_LEVEL=5 +> export MIOPEN_ENABLE_LOGGING_CMD=1 +> export MIOPEN_LOG_LEVEL=6 > ``` * `MIOPEN_ENABLE_LOGGING_MPMT` - When enabled, each log line is prefixed with information which allows the user to identify records printed from different processes and/or threads. Useful for debugging multi-process/multi-threaded apps. 
@@ -43,12 +44,13 @@ The following list of environment variables allow for enabling/disabling various > 0, no, false, disable, disabled - to disable kernels/algorithm > ``` -If a variable is not set, then MIOpen behaves as if it is set to `enabled`, unless otherwise specified. So all kinds of kernels/algorithms are enabled by default and variables can be used for disabling them. +If a variable is not set, then MIOpen behaves as if it is set to `enabled`, unless otherwise specified. So all kinds of kernels/algorithms are enabled by default and the below variables can be used for disabling them. The exception to this rule is `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM` which is disabled by default: * `MIOPEN_DEBUG_CONV_FFT` – FFT convolution algorithm. * `MIOPEN_DEBUG_CONV_DIRECT` – Direct convolution algorithm. * `MIOPEN_DEBUG_CONV_GEMM` - GEMM convolution algorithm. These are implemented on top of miopengemm or rocBlas. * `MIOPEN_DEBUG_GCN_ASM_KERNELS` – Kernels written in assembly language. So far, the most of the assembly kernels are implementing the Direct convolution algorithm. +* `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM` – FP32 implicit GEMM convolution algorithm, disabled by default due to compatibility issue with older compiler. Set to 1 to turn on implicit GEMM algorithm. * `MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES` - Binary kernels. Right now all the binary kernels are Winograd ones, however, not all Winograds are binaries. To disable all Winograd algorithms, the following two vars can be used: * `MIOPEN_DEBUG_AMD_WINOGRAD_3X3` - FP32 Winograd Fwd/Bwd, filter size fixed to 3x3. * `MIOPEN_DEBUG_AMD_WINOGRAD_RXS` - FP32 and FP16 Winograd Fwd/Bwd, variable filter size. diff --git a/doc/src/Getting_Started_FusionAPI.md b/doc/src/Getting_Started_FusionAPI.md index 8ecedcbcb5..7bc4ad5e24 100644 --- a/doc/src/Getting_Started_FusionAPI.md +++ b/doc/src/Getting_Started_FusionAPI.md @@ -190,231 +190,11 @@ Once the fusion plan object is destroyed, all the operations created are destroy The tables below outlines the supported fusions for fp32 and fp16 as well as any applicable constraints. **(C = convolution, B = bias, N = batch normalization, A = activation)** -### Convolution based FP32 Fusion for Inference - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-**Single Precision Floating Point**
-
-| Combination | Conv Algo | Stride | Filter Dims | N Mode* | Activations | Other Constraints |
-| ----------- | --------- | ------ | ----------- | ------- | ----------- | ----------------- |
-| CBNA | Direct | 1 and 2 | 3x3, 5x5, 7x7, 9x9, 11x11 | All | All | stride and padding must be either 1 or 2 |
-| CBA | Direct | | 1x1 | | All | stride/padding not supported |
-| CBA | Winograd | 1 | 1x1, 2x2 | N/A | Relu, Leaky Relu | c >= 18 |
-| | | 1 | 3x3 | | Relu, Leaky Relu | c >= 18 and c is even |
-| | | 1 | 4x4, 5x5, 6x6 | | Relu, Leaky Relu | 4 x c >= 18 |
-| | | 1 | 7x7, 8x8, 9x9 | | Relu, Leaky Relu | 12 x c >= 18 |
-| | | 1 | 10x10, 11x11, 12x12 | | Relu, Leaky Relu | 16 x c >= 18 |
-| | | 1 | larger filter sizes | | Relu, Leaky Relu | none |
-| | | 2 | 1x1 | | Relu, Leaky Relu | 2 x c >= 18 |
-| | | 2 | 2x2, 3x3, 4x4, 5x5, 6x6 | | Relu, Leaky Relu | 4 x c >= 18 |
-| | | 2 | 7x7 | | Relu, Leaky Relu | 12 x c >= 18 |
-| | | 2 | 8x8, 9x9, 10x10, 11x11, 12x12 | | Relu, Leaky Relu | 16 x c >= 18 |
-| | | 2 | larger filter sizes | | Relu, Leaky Relu | none |
-| NA | - | - | | All | All | Padding not supported |
-
-*N mode is either spatial, or per activation. For CBA other asymmetric kernels are supported as well, but are not enumerated here for brevity.
-
-### Convolution based FP16 Fusion for Inference
-
-**Half Precision Floating Point**
-
-| Combination | Conv Algo | Stride | Filter Dims | N Mode* | Activations | Other Constraints |
-| ----------- | --------- | ------ | ----------- | ------- | ----------- | ----------------- |
-| CBNA | Direct | 1 and 2 | 3x3, 5x5, 7x7, 9x9, 11x11 | All | All | stride and padding must be either 1 or 2 |
-| CBA | Direct | | 1x1 | | All | stride/padding not supported |
-
-*N mode is either spatial, or per activation.
-
-### Batch Normalization based fusion for FP32 and FP16 for Inference and Training
-
-| Combination | N mode* | Activations | Constraints |
-| ----------- | ------- | ----------- | ----------- |
-| NA for inference | All | All | None |
-| NA forward training | All | All | None |
-| NA backward training | All | All | None |
-
-*N mode is either spatial, or per activation.
-
+![Convolution based fp32 fusion](fp32fusions.png) +![Convolution based fp16 fusion](fp16fusions.png) + ## Performance Comparison to Non-Fused Kernels diff --git a/doc/src/apireference.rst b/doc/src/apireference.rst index 67d32b9f4d..80b9fe974b 100644 --- a/doc/src/apireference.rst +++ b/doc/src/apireference.rst @@ -7,6 +7,7 @@ API Reference :maxdepth: 4 :caption: Contents: + datatypes handle tensor activation @@ -16,4 +17,5 @@ API Reference lrn pooling softmax - fusion \ No newline at end of file + fusion + loss \ No newline at end of file diff --git a/doc/src/datatypes.md b/doc/src/datatypes.md new file mode 100644 index 0000000000..1a46e5fd62 --- /dev/null +++ b/doc/src/datatypes.md @@ -0,0 +1,38 @@ + +# Datatypes + + +MIOpen contains several datatypes at different levels of support. The enumerated datatypes are shown below: + +``` +typedef enum { + miopenHalf = 0, + miopenFloat = 1, + miopenInt32 = 2, + miopenInt8 = 3, + miopenInt8x4 = 4, + miopenBFloat16 = 5, +} miopenDataType_t; +``` + +Of these types, only `miopenFloat` and `miopenHalf` are fully supported across all layers in MIOpen. Please see the individual layers in the API reference section for specific datatype support and limitations. + +Type descriptions: + * `miopenHalf` - 16-bit floating point + * `miopenFloat` - 32-bit floating point + * `miopenInt32` - 32-bit integer, used primarily for `int8` convolution outputs + * `miopenInt8` - 8-bit integer, currently only supported by the `int8` convolution forward path, tensor set, tensor copy, tensor cast, tensor transform, tensor transpose, and im2col. + * `miopenInt8x4` - 8-bit 4 element vector type used primarily with the `int8` convolution forward path. + * `miopenBFloat16` - brain float fp-16 (8-bit exponent, 7-bit fraction), currently only supported by convolutions, tensor set, and tensor copy. + + +Note: In addition to the standard datatypes above, pooling contains its own indexing datatypes: +``` +typedef enum { + miopenIndexUint8 = 0, + miopenIndexUint16 = 1, + miopenIndexUint32 = 2, + miopenIndexUint64 = 3, +} miopenIndexType_t; +``` + diff --git a/doc/src/driverTableCrop.png b/doc/src/driverTableCrop.png new file mode 100644 index 0000000000..0fd6395f2b Binary files /dev/null and b/doc/src/driverTableCrop.png differ diff --git a/doc/src/find_and_immediate.md b/doc/src/find_and_immediate.md new file mode 100644 index 0000000000..8da241798f --- /dev/null +++ b/doc/src/find_and_immediate.md @@ -0,0 +1,160 @@ +Find and Immediate Mode +======================= + + + +## Find API + +MIOpen contains several convolution algorithms for each stage of training or inference. Prior to MIOpen version 2.0, users needed to call Find methods in order to generate a set of applicable algorithms. + +A typical workflow for the find stage: + +``` +miopenConvolutionForwardGetWorkSpaceSize(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + &maxWorkSpaceSize); + +// < allocate workspace > + + +// NOTE: +// miopenFindConvolution*() call is expensive in terms of execution time and required workspace. +// Therefore it is highly recommended to save off the selected algorithm and workspace required so that +// they can be reused later within the lifetime of the same MIOpen handle object. +// In this way, there should be no need to invoke miopenFind*() more than once per application lifetime.
+ +miopenFindConvolutionForwardAlgorithm(handle, + inputTensorDesc, + input_device_mem, + weightTensorDesc, + weight_device_mem, + convDesc, + outputTensorDesc, + output_device_mem, + request_algo_count, + &ret_algo_count, + perf_results, + workspace_device_mem, + maxWorkSpaceSize, + 1); + +// < select fastest algorithm > + +// < free previously allocated workspace and allocate workspace required for the selected algorithm> + +miopenConvolutionForward(handle, &alpha, + inputTensorDesc, + input_device_mem, + weightTensorDesc, + weight_device_mem, + convDesc, + perf_results[0].fwd_algo, // use the fastest algo + &beta, + outputTensorDesc, + output_device_mem, + workspace_device_mem, + perf_results[0].memory); //workspace size +``` + + +The results of Find() are returned in an array of `miopenConvAlgoPerf_t` structs in order of performance, with the fastest at index 0. + +This call sequence is executed once per session as it is inherently expensive. Of those, `miopenFindConvolution*()` is the most expensive call. It caches its own results on disk, so the subsequent calls during the same MIOpen session will execute faster. However, it is better to remember results of `miopenFindConvolution*()` in the application, as recommended above. + + + + +## Immediate Mode API + +MIOpen v2.0 introduces the immediate mode, which removes the requirement for the `miopenFindConvolution*()` calls and their associated runtime costs. In this mode, the user can query the MIOpen runtime for all the supported _solutions_ for a given convolution configuration. These solutions may use the same algorithm or different ones. The sequence of operations in immediate mode is similar to launching regular convolutions in MIOpen, i.e., through the use of the `miopenFindConvolution*()` API. However, in this case the different APIs have much lower runtime cost. A typical convolution call would be similar to the following sequence of calls: + +* The user constructs the MIOpen handle and relevant descriptors such as the convolution descriptor as usual. +* With the above data structures, the user calls `miopenConvolution*GetSolutionCount` to get the **maximum** number of supported solutions for the convolution descriptor in question. +* The count obtained above is used to allocate memory for the `miopenConvSolution_t` structure introduced in MIOpen v2.0. +* The user calls `miopenConvolution*GetSolution` to populate the `miopenConvSolution_t` structures allocated above. The returned list is ordered by performance, thus the first element would be the fastest. +* While the above structure returns the amount of workspace required for an algorithm, the user may inquire the amount of workspace required for a known solution id by using the `miopenConvolution*GetSolutionWorkspaceSize` API call. However, this is not a requirement, since the structure returned by `miopenConvolution*GetSolution` would already have this information. +* Now the user may initiate the convolution operation in _immediate_ mode by calling `miopenConvolution*Immediate`, which would populate the output tensor descriptor with the respective convolution result. However, the first call to `miopenConvolution*Immediate` may consume more time since the kernel may not be present in the kernel cache and may need to be compiled.
+* Optionally, the user may compile the solution of choice by calling `miopenConvolution*CompileSolution`, which would ensure that the kernel represented by the chosen solution is populated in the kernel cache a priori, removing the necessity for compiling the kernel in question. + + +``` +miopenConvolutionForwardGetSolutionCount(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + &solutionCount); + + +// < allocate an array of miopenConvSolution_t of size solutionCount > + + +miopenConvolutionForwardGetSolution(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + solutionCount, + &actualCount, + solutions); + +// < select a solution from solutions array > + +miopenConvolutionForwardGetSolutionWorkspaceSize(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + selected->solution_id, + &ws_size); + +// < allocate solution workspace of size ws_size > + + +// This stage is optional +miopenConvolutionForwardCompileSolution(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + selected->solution_id); + + + + miopenConvolutionForwardImmediate(handle, + weightTensor, + weight_device_mem, + inputTensorDesc, + input_device_mem, + convDesc, + outputTensorDesc, + output_device_mem, + workspace_device_mem, + ws_size, + selected->solution_id); +``` + +## Immediate Mode Fall Back + +The immediate mode is underpinned by the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html); however, it may not contain every configuration of interest. Immediate mode's behavior when encountering a database miss is to fall back to a GEMM algorithm. The GEMM algorithm will handle most cases; however, if the user requires performance, they should run the Find stage at least once. In the fallback path, `miopenConvolution*GetSolution` returns only one `miopenConvSolution_t` structure, and its `time` member contains a negative value. Future releases will implement a more robust heuristic-based fallback, which is expected to provide better (but still non-optimal) performance. + + + +## Limitations of Immediate Mode + +### Architectural Limitations +The system Find-Db has only been populated for the following architectures: + * gfx906 with 64 CUs + * gfx906 with 60 CUs + * gfx900 with 64 CUs + * gfx900 with 56 CUs + +If the user's architecture is not listed above, they will need to run the Find API once on their system per application in order to take advantage of immediate mode's more efficient behavior. + + +### Backend Limitations + +OpenCL support for immediate mode via the fallback is limited to fp32 datatypes. This is because the current release's fallback path goes through GEMM, which on the OpenCL backend is serviced through MIOpenGEMM -- which itself only supports fp32. The HIP backend uses rocBLAS as its fallback path, which supports a richer set of datatypes. diff --git a/doc/src/finddb.md b/doc/src/finddb.md new file mode 100644 index 0000000000..39d8cfd455 --- /dev/null +++ b/doc/src/finddb.md @@ -0,0 +1,44 @@ +Find-Db Database +================ + +Prior to MIOpen 2.0, users utilized calls such as `miopenFindConvolution*Algorithm()` to gather a set of convolution algorithms in the form of an array of `miopenConvSolution_t` structs. This process is time consuming because it requires online benchmarking of competing algorithms. In MIOpen 2.0 an [immediate mode](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/find_and_immediate.html) is introduced.
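+As a rough illustrative sketch only (the helper below is not part of the MIOpen API or its documentation; its name and structure are assumptions), an application can check whether an immediate mode query was answered from Find-Db or from the GEMM fallback described in the Immediate Mode Fall Back section above: on a database miss, `miopenConvolution*GetSolution` returns a single `miopenConvSolution_t` whose `time` member is negative.
+
+```
+#include <miopen/miopen.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+/* Illustrative helper (assumed name): returns true when the returned solution
+   list looks like the Find-Db fallback, i.e. a single entry whose reported
+   time is negative. */
+static bool solutions_from_fallback(const miopenConvSolution_t* solutions,
+                                    size_t returned_count)
+{
+    return returned_count == 1 && solutions[0].time < 0;
+}
+```
+
+When this check is true, running the Find stage once for the configuration and re-querying will typically yield better-performing solutions.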
+ +Immediate mode is based on a database which contains the results of calls to the legacy Find() stage. This database is called `Find-Db`. It consists of two parts: +- **System Find-Db**, a system-wide storage which holds the pre-run values for the most applicable configurations, +- **User Find-Db**, a per-user storage which is intended to hold results for arbitrary user-run configurations. It also performs double duty as a cache for the Find() stage. + +The User Find-Db **always takes precedence** over System Find-Db. + +By default, System Find-Db resides within MIOpen's install location, while User Find-Db resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. + + * The System Find-Db is *not* modified upon installation of MIOpen. + * There are separate Find databases for HIP and OpenCL backends. + +### Populating the User Find-Db + +MIOpen collects Find-db information during the following MIOpen API calls: +- `miopenFindConvolutionForwardAlgorithm()` +- `miopenFindConvolutionBackwardDataAlgorithm()` +- `miopenFindConvolutionBackwardWeightsAlgorithm()` + +During the call, find data entries are collected for one _problem configuration_ (implicitly defined by the tensor descriptors and convolution descriptor passed to the API function). + + +### Updating MIOpen and the User Find-Db + +When the user installs a new version of MIOpen, the new version will _ignore_ old **User find-db** files. Thus, the user is _not required_ to move or delete their old User find-db files. However, the user may wish to re-collect the information into their brand new **User find-db**. This should be done in the same way as it was done with the previous version of the library -- _if_ it was done. This would keep Immediate mode optimized. + + +### Disabling Find-Db + +By default, MIOpen will use the Find-Db. Users can disable the Find-Db by setting the environmental variable `MIOPEN_DEBUG_DISABLE_FIND_DB` to 1: +``` +export MIOPEN_DEBUG_DISABLE_FIND_DB=1 +``` + +**Note:** The System Find-Db has the ability to be cached into memory and may increase performance dramatically. To enable this option, use the cmake configuration flag: +``` +-DMIOPEN_DEBUG_FIND_DB_CACHING=On +``` + + diff --git a/doc/src/fp16fusions.png b/doc/src/fp16fusions.png new file mode 100644 index 0000000000..425a29c371 Binary files /dev/null and b/doc/src/fp16fusions.png differ diff --git a/doc/src/fp32fusions.png b/doc/src/fp32fusions.png new file mode 100644 index 0000000000..bf8ac94139 Binary files /dev/null and b/doc/src/fp32fusions.png differ diff --git a/doc/src/index.rst b/doc/src/index.rst index 2509db6110..8c12c07aa7 100644 --- a/doc/src/index.rst +++ b/doc/src/index.rst @@ -19,6 +19,8 @@ Sources and binaries can be found at `MIOpen's GitHub site