diff --git a/.github/workflows/fbgemm_docs.yml b/.github/workflows/fbgemm_docs.yml new file mode 100644 index 000000000..327e5dd6b --- /dev/null +++ b/.github/workflows/fbgemm_docs.yml @@ -0,0 +1,84 @@ +# This workflow builds the fbgemm_gpu docs and deploys them to gh-pages. +name: Generate documentation +on: + push: + branches: + - main +jobs: + build_docs_job: + runs-on: linux.2xlarge + steps: + # Checkout the repository to the GitHub Actions runner + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: true + # Update references + # TODO: update the git submodule sync after the auto-sync part is fixed + - name: Git Submodule Update + run: | + git submodule init + git submodule update --remote --recursive + git log + - name: Update pip + run: | + sudo yum update -y + sudo yum -y install git python3-pip + sudo pip3 install --upgrade pip + - name: Setup conda + run: | + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh + bash ~/miniconda.sh -b -p $HOME/miniconda + - name: setup Path + run: | + echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH + echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH + - name: create conda env + run: | + conda create --name build_binary python=3.9 + conda info + - name: check python version + run: | + conda run -n build_binary python --version + - name: Install gcc + shell: bash + run: | + sudo yum group install -y "Development Tools" + - name: Setup Path + run: | + echo /usr/local/bin >> $GITHUB_PATH + - name: Install PyTorch + shell: bash + run: | + conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + - name: Test PyTorch Installation + run: | + conda run -n build_binary python -c "import torch.distributed" + echo "torch.distributed succeeded" + - name: Install fbgemm_gpu nightly + run: | + conda run -n build_binary python -m pip install fbgemm-gpu-nightly-cpu + - name: Test fbgemm_gpu installation + shell: bash + run: | + conda run -n build_binary \ + python -c "import fbgemm_gpu" + - name: Install Doxygen + run: | + conda install -n build_binary -c conda-forge doxygen + which doxygen + - name: Build the docset + run: | + conda run -n build_binary python -m pip install -r fbgemm_gpu/docs/requirements.txt + cd ./fbgemm_gpu/docs + conda run -n build_binary doxygen Doxyfile.in + conda run -n build_binary make html + cd .. + - name: Get output time + run: echo "The time was ${{ steps.build.outputs.time }}" + - name: Deploy + uses: JamesIves/github-pages-deploy-action@releases/v3 + with: + ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: gh-pages # The branch the action should deploy to. + FOLDER: fbgemm_gpu/docs/build/html # The folder the action should deploy.
diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 552b5eb33..f70e20dae 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -117,6 +117,7 @@ jobs: --python-tag=${{ matrix.python-tag }} \ -DTORCH_CUDA_ARCH_LIST="'7.0;8.0'" \ --plat-name=manylinux1_x86_64 + ls -lt dist/*.whl - name: Upload wheel as GHA artifact uses: actions/upload-artifact@v2 with: @@ -271,4 +272,5 @@ jobs: --username __token__ \ --password "$PYPI_TOKEN" \ --skip-existing \ + --verbose \ fbgemm_gpu_nightly-*.whl diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index ebaeee2b1..8e6ba7291 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -105,6 +105,7 @@ jobs: --python-tag=${{ matrix.python-tag }} \ --cpu_only \ --plat-name=manylinux1_x86_64 + ls -lt dist/*.whl - name: Upload wheel as GHA artifact uses: actions/upload-artifact@v2 with: @@ -156,4 +157,5 @@ jobs: --username __token__ \ --password "$PYPI_TOKEN" \ --skip-existing \ + --verbose \ fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 1d8acca06..c39490c32 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -119,6 +119,7 @@ jobs: --python-tag=${{ matrix.python-tag }} \ -DTORCH_CUDA_ARCH_LIST="'7.0;8.0'" \ --plat-name=manylinux1_x86_64 + ls -lt dist/*.whl - name: Upload wheel as GHA artifact uses: actions/upload-artifact@v2 with: @@ -273,4 +274,5 @@ jobs: --username __token__ \ --password "$PYPI_TOKEN" \ --skip-existing \ + --verbose \ fbgemm_gpu-*.whl diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index 27c0c5888..99664baac 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -107,6 +107,7 @@ jobs: --python-tag=${{ matrix.python-tag }} \ --cpu_only \ --plat-name=manylinux1_x86_64 + ls -lt dist/*.whl - name: Upload wheel as GHA artifact uses: actions/upload-artifact@v2 with: @@ -158,4 +159,5 @@ jobs: --username __token__ \ --password "$PYPI_TOKEN" \ --skip-existing \ + --verbose \ fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl diff --git a/.github/workflows/fbgemmci.yml b/.github/workflows/fbgemmci.yml index 092226d19..250544bed 100644 --- a/.github/workflows/fbgemmci.yml +++ b/.github/workflows/fbgemmci.yml @@ -248,6 +248,113 @@ jobs: python -c "import fbgemm_gpu" python -c "import fbgemm_gpu.split_embedding_codegen_lookup_invokers" + build_amd_gpu: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - name: Free space + run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android + + - uses: actions/checkout@v2 + + - name: Install ROCm 5.1.1 + shell: bash + run: | + sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + wget https://repo.radeon.com/amdgpu-install/22.10.1/ubuntu/focal/amdgpu-install_22.10.1.50101-1_all.deb + export DEBIAN_FRONTEND=noninteractive + sudo apt install -y ./amdgpu-install_22.10.1.50101-1_all.deb + amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms + sudo rm amdgpu-install_22.10.1.50101-1_all.deb + + - name: Install dependencies + shell: bash + run: | + sudo apt-get update + 
sudo apt-get -y install git pip python3-dev mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev + sudo apt-get install -y hipify-clang || true + sudo pip install cmake scikit-build ninja jinja2 numpy hypothesis --no-input + sudo apt-get clean + # Install pytorch 1.11 as required by fbgemm_gpu + sudo pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.1.1/ + + - name: Checkout submodules + shell: bash + run: | + cd fbgemm_gpu + git submodule sync + git submodule update --init --recursive + + - name: Build fbgemm_gpu + shell: bash + run: | + sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + cd fbgemm_gpu + # build for MI250 only to save time. + sudo PYTORCH_ROCM_ARCH=gfx90a python3 setup.py build develop + + - name: Test fbgemm_gpu installation + shell: bash + run: | + cd fbgemm_gpu + cd test + python3 input_combine_test.py + python3 quantize_ops_test.py + python3 sparse_ops_test.py + python3 -c "import fbgemm_gpu" + python3 -c "import fbgemm_gpu.split_embedding_codegen_lookup_invokers" + + test_amd_gpu: + runs-on: rocm + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - name: pre-checkout + shell: bash + run: | + if [ -d ${{ github.workspace }} ] + then + sudo chown -R $USER:$USER ${{ github.workspace }} + fi + sudo add-apt-repository ppa:git-core/ppa + sudo apt update + sudo apt -y install --only-upgrade git + + - uses: actions/checkout@v2 + with: + ref: ${{ github.ref }} + submodules: 'true' + + - name: build fbgemm_gpu and test + shell: bash + run: | + set -eux + env + ls -l + DOCKER_IMAGE=rocm/pytorch:rocm5.1.1_ubuntu20.04_py3.7_pytorch_staging_base + docker pull $DOCKER_IMAGE + JENKINS_REPO_DIR=fbgemm-private-jenkins + JENKINS_REPO_DIR_BAREMETAL=$PWD + JENKINS_REPO_DIR_DOCKER=/workspace/$JENKINS_REPO_DIR + DOCKER_OPTIONS="\ + --user 0 \ + --network=host \ + --ipc=host \ + --shm-size 16G \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device=/dev/kfd \ + --device=/dev/dri \ + -v $JENKINS_REPO_DIR_BAREMETAL:$JENKINS_REPO_DIR_DOCKER + " + docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER + build_cpu_only: runs-on: ${{ matrix.os }} strategy: diff --git a/.jenkins/rocm/build_and_test.sh b/.jenkins/rocm/build_and_test.sh new file mode 100755 index 000000000..dadd1342c --- /dev/null +++ b/.jenkins/rocm/build_and_test.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# exit immediately on failure, or if an undefined variable is used +set -eux + +FBGEMM_REPO_DIR=${1:-/workspace/FBGEMM} + +git config --global --add safe.directory "$FBGEMM_REPO_DIR" +git config --global --add safe.directory "$FBGEMM_REPO_DIR/third_party/asmjit" +git config --global --add safe.directory "$FBGEMM_REPO_DIR/third_party/cpuinfo" +git config --global --add safe.directory "$FBGEMM_REPO_DIR/third_party/googletest" +git config --global --add safe.directory "$FBGEMM_REPO_DIR/third_party/hipify_torch" + +# Install dependencies +apt-get update --allow-insecure-repositories && \ + apt-get install -y --allow-unauthenticated \ + git \ + jq \ + sshfs \ + sshpass \ + unzip + +apt-get install -y locales +locale-gen en_US.UTF-8 + +pip3 install click +pip3 install jinja2 +pip3 install ninja +pip3 install scikit-build +pip3 install --upgrade hypothesis +pip3 install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.1.1/ + +pip3 list + +# Build fbgemm_gpu +cd 
"$FBGEMM_REPO_DIR/fbgemm_gpu" +MAX_JOBS="$(nproc)" +export MAX_JOBS +export PYTORCH_ROCM_ARCH="gfx908" +python setup.py build develop + +export FBGEMM_TEST_WITH_ROCM=1 + +# Test fbgemm_gpu +cd test + +python batched_unary_embeddings_test.py --verbose +python input_combine_test.py --verbose +python jagged_tensor_ops_test.py --verbose +python layout_transform_ops_test.py --verbose +python merge_pooled_embeddings_test.py --verbose +python metric_ops_test.py --verbose +python permute_pooled_embedding_modules_test.py --verbose +python quantize_ops_test.py --verbose +python sparse_ops_test.py --verbose +python split_embedding_inference_converter_test.py --verbose +python split_table_batched_embeddings_test.py --verbose +python uvm_test.py --verbose diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be603dfa..d8b6b989c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,6 +182,10 @@ if(NOT TARGET asmjit) add_subdirectory("${ASMJIT_SRC_DIR}" "${FBGEMM_BINARY_DIR}/asmjit") set_property(TARGET asmjit PROPERTY POSITION_INDEPENDENT_CODE ON) + # add a flag required for mac build + if(NOT MSVC) + target_compile_options(asmjit PRIVATE "-Wno-sign-conversion") + endif() endif() if(NOT TARGET cpuinfo) @@ -293,6 +297,10 @@ endif() if(FBGEMM_BUILD_BENCHMARKS) add_subdirectory(bench) + # add a flag to enable Clang 14 + set_source_files_properties( + bench/GEMMsBenchmark.cc + PROPERTIES COMPILE_FLAGS "-Wno-unused-variable") endif() if(FBGEMM_BUILD_DOCS) diff --git a/README.md b/README.md index 48de40e7c..195f5e011 100644 --- a/README.md +++ b/README.md @@ -72,9 +72,8 @@ cd FBGEMM # if you are updating an existing checkout git submodule sync git submodule update --init --recursive -mkdir build && cd build -cmake .. -make +cmake -B build +make -C build ``` To run the tests after building FBGEMM (if tests are built), use the following diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 09c1669ac..2a1babf90 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -10,17 +10,14 @@ if(SKBUILD) message("The project is built using scikit-build") endif() -if(EXISTS "/usr/bin/nvidia-smi") - message("NVIDIA GPU detected.") - option(USE_CUDA "Use CUDA" ON) - option(USE_ROCM "Use ROCm" OFF) -elseif(EXISTS "/opt/rocm/bin/rocm-smi") +option(USE_CUDA "Use CUDA" ON) +option(USE_ROCM "Use ROCm" OFF) + +if(((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) + AND NOT (EXISTS "/bin/nvcc")) message("AMD GPU detected.") - option(USE_CUDA "Use CUDA" OFF) - option(USE_ROCM "Use ROCm" ON) -else() - message("Unable to detect GPU vendor") - message(FATAL_ERROR "") + set(USE_CUDA OFF) + set(USE_ROCM ON) endif() if(FBGEMM_CPU_ONLY) @@ -28,6 +25,7 @@ if(FBGEMM_CPU_ONLY) endif() message("${message_line}") +message(STATUS "USE_ROCM ${USE_ROCM}") if(FBGEMM_CPU_ONLY OR USE_ROCM) project( @@ -41,25 +39,20 @@ else() LANGUAGES CXX C CUDA) endif() -if(USE_CUDA) - set(default_cuda_architectures 60 61 70 75 80) - set(cuda_architectures_doc - "CUDA architectures to build for. Default is ${default_cuda_architectures}") - set(cuda_architectures - "${default_cuda_architectures}" - CACHE STRING "${cuda_architectures_doc}") +find_package(Torch REQUIRED) +find_package(PythonExtensions REQUIRED) - message("${message_line}") - message("fbgemm_gpu:") - message("Building for cuda_architectures = \"${cuda_architectures}\"") - message("${message_line}") +set(FBGEMM ${CMAKE_CURRENT_SOURCE_DIR}/..) 
+set(THIRDPARTY ${FBGEMM}/third_party) if(DEFINED GLIBCXX_USE_CXX11_ABI) if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") - message("${CMAKE_CXX_FLAGS}") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() + message("${CMAKE_CXX_FLAGS}") endif() # @@ -68,49 +61,22 @@ endif() # constructor exists to convert from "int" to "__half" errors in # gen_embedding_forward_quantized_split_[un]weighted_codegen_cuda.cu # - set(TORCH_CUDA_OPTIONS - --expt-relaxed-constexpr - -D__CUDA_NO_HALF_OPERATORS__ - # -D__CUDA_NO_HALF_CONVERSIONS__ - -D__CUDA_NO_BFLOAT16_CONVERSIONS__ - -D__CUDA_NO_HALF2_OPERATORS__) -endif() -find_package(Torch REQUIRED) -find_package(PythonExtensions REQUIRED) - -set(FBGEMM ${CMAKE_CURRENT_SOURCE_DIR}/..) -set(THIRDPARTY ${FBGEMM}/third_party) +set(TORCH_CUDA_OPTIONS + --expt-relaxed-constexpr -D__CUDA_NO_HALF_OPERATORS__ + # -D__CUDA_NO_HALF_CONVERSIONS__ + -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__) if(USE_ROCM) - if(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) - SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) - else() - SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) - endif() - - list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${THIRDPARTY}/hipify_torch/cmake") + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" + "${THIRDPARTY}/hipify_torch/cmake") include(Hip) - if(NOT FBGEMM_HAVE_HIP) - message(FATAL_ERROR "Not able to find HIP installation.") - endif() include(Hipify) - list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) - set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) - - find_package(rocBLAS REQUIRED) - find_package(hipFFT REQUIRED) - find_package(hipRAND REQUIRED) - find_package(rocRAND REQUIRED) - find_package(hipSPARSE REQUIRED) - find_package(OpenMP REQUIRED) - find_package(rocPRIM REQUIRED) - + message("${message_line}") - message(STATUS "hip found ${ROCM_FOUND}") + message(STATUS "hip found ${HIP_FOUND}") endif() - # # GENERATED CUDA, CPP and Python code # @@ -197,49 +163,41 @@ set(codegen_dependencies ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/embedding_backward_template_helpers.cuh ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/embedding_common.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_cuda_utils.cuh - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_gpu.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h -) + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h) -if(USE_CUDA) - add_custom_command( - OUTPUT ${gen_cpu_source_files} ${gen_gpu_source_files} - ${gen_gpu_host_source_files} ${gen_python_files} +if(USE_ROCM) + execute_process( COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" - "--opensource" - DEPENDS "${codegen_dependencies}") - - set_source_files_properties( - ${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma;-fopenmp") -elseif(USE_ROCM) - execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" - "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" - "--opensource") + "--opensource" DEPENDS "${codegen_dependencies}") set(header_include_dir - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CMAKE_CURRENT_SOURCE_DIR} - ) - hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} HEADER_INCLUDE_DIR 
${header_include_dir}) + ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}) - set_source_files_properties( - ${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma") + hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} HEADER_INCLUDE_DIR + ${header_include_dir}) +else() + add_custom_command( + OUTPUT ${gen_cpu_source_files} ${gen_gpu_source_files} + ${gen_gpu_host_source_files} ${gen_python_files} + COMMAND + "${PYTHON_EXECUTABLE}" + "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" "--opensource" + DEPENDS "${codegen_dependencies}") endif() +set_source_files_properties( + ${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS + "-mavx2;-mf16c;-mfma;-fopenmp") set_source_files_properties( ${gen_cpu_source_files} PROPERTIES INCLUDE_DIRECTORIES - "${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_SOURCE_DIR}/../include" + "${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_SOURCE_DIR}/../include;${THIRDPARTY}/asmjit/src" ) set_source_files_properties( @@ -257,8 +215,8 @@ set_source_files_properties(${gen_gpu_source_files} PROPERTIES COMPILE_OPTIONS "${TORCH_CUDA_OPTIONS}") if(NOT FBGEMM_CPU_ONLY) - set(gen_source_files ${gen_gpu_source_files} - ${gen_gpu_host_source_files} ${gen_cpu_source_files}) + set(gen_source_files ${gen_gpu_source_files} ${gen_gpu_host_source_files} + ${gen_cpu_source_files}) else() set(gen_source_files ${gen_cpu_source_files}) endif() @@ -285,14 +243,18 @@ set(cpp_fbgemm_files_avx2 "../src/EmbeddingSpMDMAvx2.cc" set_source_files_properties(${cpp_fbgemm_files_avx2} PROPERTIES COMPILE_OPTIONS "-mavx2;-mf16c;-mfma") -set(cpp_fbgemm_files ${cpp_fbgemm_files_normal} ${cpp_fbgemm_files_avx2}) set(cpp_fbgemm_files_avx512 "../src/EmbeddingSpMDMAvx512.cc") -if(USE_CUDA) - set_source_files_properties( - ${cpp_fbgemm_files_avx512} - PROPERTIES COMPILE_OPTIONS - "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl") - list(APPEND cpp_fbgemm_files ${cpp_fbgemm_files_avx512}) + +set_source_files_properties( + ${cpp_fbgemm_files_avx512} + PROPERTIES COMPILE_OPTIONS + "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl") + +if(USE_ROCM) + set(cpp_fbgemm_files ${cpp_fbgemm_files_normal} ${cpp_fbgemm_files_avx2}) +else() + set(cpp_fbgemm_files ${cpp_fbgemm_files_normal} ${cpp_fbgemm_files_avx2} + ${cpp_fbgemm_files_avx512}) endif() set(cpp_fbgemm_files_include_directories @@ -307,18 +269,15 @@ set_source_files_properties( # Actual static SOURCES # -# Ensure NVML_LIB_PATH is empty if it wasn't set and if the -# default lib path doesn't exist. +# Ensure NVML_LIB_PATH is empty if it wasn't set and if the default lib path +# doesn't exist. 
if(NOT NVML_LIB_PATH) set(DEFAULT_NVML_LIB_PATH - "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so") + "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so") if(EXISTS ${DEFAULT_NVML_LIB_PATH}) - message( - STATUS - "Setting NVML_LIB_PATH: \ - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so" - ) + message(STATUS "Setting NVML_LIB_PATH: \ + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so") set(NVML_LIB_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so") endif() endif() @@ -336,7 +295,9 @@ set(fbgemm_gpu_sources_cpu src/sparse_ops_cpu.cpp) if(NOT FBGEMM_CPU_ONLY) - list(APPEND fbgemm_gpu_sources_cpu + list( + APPEND + fbgemm_gpu_sources_cpu codegen/embedding_forward_quantized_host.cpp codegen/embedding_backward_dense_host.cpp codegen/embedding_bounds_check_host.cpp @@ -347,33 +308,37 @@ if(NOT FBGEMM_CPU_ONLY) src/permute_pooled_embedding_ops_split_cpu.cpp src/quantize_ops_gpu.cpp src/sparse_ops_gpu.cpp - src/split_table_batched_embeddings.cpp) + src/split_table_batched_embeddings.cpp + src/metric_ops_host.cpp) - if(NVML_LIB_PATH) - list(APPEND fbgemm_gpu_sources_cpu - src/merge_pooled_embeddings_cpu.cpp - src/merge_pooled_embeddings_gpu.cpp) - endif() + if(NVML_LIB_PATH) + list(APPEND fbgemm_gpu_sources_cpu src/merge_pooled_embeddings_cpu.cpp + src/merge_pooled_embeddings_gpu.cpp) + endif() endif() -set(fbgemm_gpu_sources_cpu_option "-mavx;-mf16c;-mfma;-mavx2") -if(USE_CUDA) - set_source_files_properties( - ${fbgemm_gpu_sources_cpu} PROPERTIES COMPILE_OPTIONS - "${fbgemm_gpu_sources_cpu_option};-fopenmp") -endif() +set_source_files_properties( + ${fbgemm_gpu_sources_cpu} PROPERTIES COMPILE_OPTIONS + "-mavx;-mf16c;-mfma;-mavx2;-fopenmp") if(NOT FBGEMM_CPU_ONLY) set(fbgemm_gpu_sources_gpu - codegen/embedding_bounds_check.cu src/cumem_utils.cu - src/histogram_binning_calibration_ops.cu src/jagged_tensor_ops.cu - src/layout_transform_ops.cu src/permute_pooled_embedding_ops.cu - src/permute_pooled_embedding_ops_split.cu - src/quantize_ops.cu src/sparse_ops.cu src/split_embeddings_cache_cuda.cu - src/split_embeddings_utils.cu) - - set_source_files_properties(${fbgemm_gpu_sources_gpu} - PROPERTIES COMPILE_OPTIONS "${TORCH_CUDA_OPTIONS}") + codegen/embedding_bounds_check.cu + src/cumem_utils.cu + src/histogram_binning_calibration_ops.cu + src/jagged_tensor_ops.cu + src/layout_transform_ops.cu + src/permute_pooled_embedding_ops.cu + src/permute_pooled_embedding_ops_split.cu + src/quantize_ops.cu + src/sparse_ops.cu + src/split_embeddings_cache_cuda.cu + src/split_embeddings_utils.cu + src/metric_ops.cu) + + set_source_files_properties( + ${fbgemm_gpu_sources_gpu} PROPERTIES COMPILE_OPTIONS + "${TORCH_CUDA_OPTIONS}") # XXXUPS!!! 
Replace with real set_source_files_properties( @@ -394,38 +359,47 @@ endif() if(USE_ROCM) set(abspath_gen_source_files) foreach(filename_gen_source_file ${gen_source_files}) - list(APPEND abspath_gen_source_files "${CMAKE_BINARY_DIR}/${filename_gen_source_file}") + list(APPEND abspath_gen_source_files + "${CMAKE_BINARY_DIR}/${filename_gen_source_file}") endforeach() endif() -if(USE_CUDA) - add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} - ${cpp_asmjit_files} ${cpp_fbgemm_files}) - set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES - "${cuda_architectures}") - if(NOT FBGEMM_CPU_ONLY) - target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) - endif() - set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) -elseif(USE_ROCM) +# +# MODULE +# + +if(USE_ROCM) get_hipified_list("${fbgemm_gpu_sources}" fbgemm_gpu_sources) get_hipified_list("${abspath_gen_source_files}" abspath_gen_source_files) get_hipified_list("${cpp_fbgemm_files}" cpp_fbgemm_files) - set(FBGEMM_ALL_HIP_FILES ${fbgemm_gpu_sources} ${abspath_gen_source_files} ${cpp_fbgemm_files}) - set_source_files_properties(${FBGEMM_ALL_HIP_FILES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + set(FBGEMM_ALL_HIP_FILES ${fbgemm_gpu_sources} ${abspath_gen_source_files} + ${cpp_fbgemm_files}) + set_source_files_properties(${FBGEMM_ALL_HIP_FILES} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) hip_include_directories("${cpp_fbgemm_files_include_directories}") - - hip_add_library(fbgemm_gpu_py SHARED ${cpp_asmjit_files} ${FBGEMM_ALL_HIP_FILES} ${FBGEMM_HIP_HCC_LIBRARIES} - HIPCC_OPTIONS ${HIP_HCC_FLAGS}) - target_include_directories(fbgemm_gpu_py PUBLIC ${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) -endif() -list (GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) -if(EXISTS "${TORCH_PATH}/ATen/cuda/CUDAGeneratorImpl.h") - target_compile_definitions(fbgemm_gpu_py PRIVATE NEW_GENERATOR_PATH) -endif() -if(EXISTS "${TORCH_PATH}/ATen/cuda/Atomic.cuh") - target_compile_definitions(fbgemm_gpu_py PRIVATE NEW_ATOMIC_PATH) + + hip_add_library( + fbgemm_gpu_py + SHARED + ${cpp_asmjit_files} + ${FBGEMM_ALL_HIP_FILES} + ${FBGEMM_HIP_HCC_LIBRARIES} + HIPCC_OPTIONS + ${HIP_HCC_FLAGS}) + target_include_directories( + fbgemm_gpu_py PUBLIC ${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} + ${ROCM_SMI_INCLUDE}) + list(GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) +else() + add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} + ${cpp_asmjit_files} ${cpp_fbgemm_files}) + set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES + "${cuda_architectures}") + + if(NOT FBGEMM_CPU_ONLY) + target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) + endif() endif() set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") @@ -435,9 +409,7 @@ if(NVML_LIB_PATH) target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) endif() target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS}) -if(USE_CUDA) - set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) -endif() +set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/README.md b/fbgemm_gpu/README.md index 680b55f14..5f95d8c93 100644 --- a/fbgemm_gpu/README.md +++ b/fbgemm_gpu/README.md @@ -143,6 +143,11 @@ cd ../bench python split_table_batched_embeddings_benchmark.py ``` +To run the tests and benchmarks on a GPU-capable device in CPU-only mode use CUDA_VISIBLE_DEVICES=-1 +``` +CUDA_VISIBLE_DEVICES=-1 python split_table_batched_embeddings_test.py +``` 
+ ## How FBGEMM_GPU works For a high-level overview, design philosophy and brief descriptions of various parts of FBGEMM_GPU please see our Wiki (work in progress). @@ -151,6 +156,10 @@ parts of FBGEMM_GPU please see our Wiki (work in progress). We have extensively used comments in our source files. The best and up-do-date documentation is available in the source files. +# Building API Documentation + +See [docs/README.md](docs/README.md). + ## Join the FBGEMM community See the [`CONTRIBUTING`](../CONTRIBUTING.md) file for how to help out. diff --git a/fbgemm_gpu/bench/README.md b/fbgemm_gpu/bench/README.md new file mode 100644 index 000000000..4588ff4a9 --- /dev/null +++ b/fbgemm_gpu/bench/README.md @@ -0,0 +1,7 @@ +# Benchmarks + +## TorchRec FusedTableBatchedEmbeddingBags + +[Torchrec](https://pytorch.org/torchrec/) uses fbgemm_gpu embedding and embedding bag implementations for Fused, Batched, Quantized versions of embedding and embeddingbag (in addition to other kernels). +They have run benchmarks on FusedEmbeddingBagCollection, which is implemented with fbgemm_gpu's [`SplitTableBatchedEmbeddingBagsCodegen`](https://github.com/pytorch/FBGEMM/blob/253b8842eeb2b33e65f7e2a7cfb79923b0e46bd7/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py#L171). They benchmark utilizing UVM and UVM-caching. +The [results](https://github.com/pytorch/torchrec/tree/main/benchmarks) show between a 13x and 23x speedup for DLRM embedding sizes. diff --git a/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py b/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py index 14dc9c359..08ca3f554 100644 --- a/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py @@ -121,7 +121,7 @@ def main(batch_size, num_tables, num_tasks, repeats) -> None: param.detach().copy_(ref_emb.emb_modules[i].weight) output_ref = ref_emb(offsets, indices) output = unary_emb(offsets_tensor, indices_tensor) - torch.testing.assert_allclose(output_ref, output) + torch.testing.assert_close(output_ref, output) # backward d_output = torch.randn([num_tasks, batch_size, len(hash_sizes)]).to(device) * 0.1 output_ref.backward(d_output) @@ -131,7 +131,8 @@ def main(batch_size, num_tables, num_tasks, repeats) -> None: d_weight_ref.append(emb.weight.grad) d_weight_ref = torch.cat(d_weight_ref).view(num_tasks, -1) d_weight = unary_emb.weight.grad - torch.testing.assert_allclose(d_weight_ref, d_weight.squeeze()) + # pyre-fixme[16]: Optional type has no attribute `squeeze`. + torch.testing.assert_close(d_weight_ref, d_weight.squeeze()) # A100 40MB L2 cache elapse, _ = benchmark_torch_function(ref_emb, (offsets, indices), iters=repeats) diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py index d9abf3626..6edf200bb 100644 --- a/fbgemm_gpu/bench/bench_utils.py +++ b/fbgemm_gpu/bench/bench_utils.py @@ -3,12 +3,22 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import itertools +import logging +import statistics import time -from typing import Tuple +from typing import Callable, List, Optional, Tuple +import numpy as np import torch +from fbgemm_gpu.split_table_batched_embeddings_ops import SparseType + +# pyre-fixme[21]: Could not find name `default_rng` in `numpy.random` (stubbed). +from numpy.random import default_rng from torch import Tensor +logging.basicConfig(level=logging.DEBUG) + def benchmark_torch_function( # pyre-fixme[2]: Parameter must be annotated.
@@ -46,3 +56,365 @@ def benchmark_torch_function( # pyre-fixme[61]: `output` is undefined, or not always defined. return float(elapsed_time) / iters, output + + +def round_up(a: int, b: int) -> int: + return int((a + b - 1) // b) * b + + +def get_device() -> torch.device: + # pyre-fixme[7]: Expected `device` but got `Union[int, device]`. + return ( + torch.cuda.current_device() + if torch.cuda.is_available() + else torch.device("cpu") + ) + + +# Merged indices with shape (T, B, L) -> (flattened indices with shape +# (T * B * L), offsets with shape (T * B + 1)) +def get_table_batched_offsets_from_dense( + merged_indices: Tensor, +) -> Tuple[Tensor, Tensor]: + (T, B, L) = merged_indices.size() + lengths = np.ones((T, B)) * L + flat_lengths = lengths.flatten() + return ( + merged_indices.long().contiguous().view(-1).to(get_device()), + torch.tensor(([0] + np.cumsum(flat_lengths).tolist())).long().to(get_device()), + ) + + +def get_offsets_from_dense(indices: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + (B, L) = indices.size() + return ( + indices.contiguous().view(-1), + torch.tensor( + np.cumsum(np.asarray([0] + [L for _ in range(B)])[:-1]).astype(np.int64) + ), + ) + + +def b_indices( + b: Callable[..., torch.Tensor], + x: torch.Tensor, + per_sample_weights: Optional[torch.Tensor] = None, + use_cpu: bool = False, + do_pooling: bool = True, +) -> torch.Tensor: + (indices, offsets) = get_offsets_from_dense(x) + if do_pooling: + return b( + indices.cuda(), + offsets.cuda(), + per_sample_weights=per_sample_weights, + ) + else: + return b(indices.cuda()) + + +def generate_requests( + iters: int, + B: int, + T: int, + L: int, + E: int, + # inter-batch indices reuse rate + reuse: float = 0.0, + # alpha <= 1.0: use uniform distribution + # alpha > 1.0: use zipf distribution + alpha: float = 1.0, + weights_precision: SparseType = SparseType.FP32, + weighted: bool = False, + requests_data_file: Optional[str] = None, + # Comma-separated list of table numbers + tables: Optional[str] = None, +) -> List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]]: + if requests_data_file is not None: + indices_tensor, offsets_tensor, lengths_tensor = torch.load(requests_data_file) + + average_L = 0 + if tables is not None: + emb_tables = tuple(int(x) for x in tables.split(",")) + indices = torch.zeros(0, dtype=indices_tensor.dtype) + offsets = torch.zeros(1, dtype=offsets_tensor.dtype) + total_L = 0 + for t in emb_tables: + t_offsets = offsets_tensor[B * t : B * (t + 1) + 1] + total_L += t_offsets[-1] - t_offsets[0] + indices = torch.cat( + (indices, indices_tensor[t_offsets[0] : t_offsets[-1]]) + ) + offsets = torch.cat( + ( + offsets, + t_offsets[1:] - t_offsets[0] + offsets[-1], + ) + ) + indices_tensor = indices + offsets_tensor = offsets + average_L = int(total_L / B) + + assert np.prod(offsets_tensor.size()) - 1 == np.prod((T, B)), ( + f"Requested tables: {emb_tables} " + f"does not conform to inputs (T, B) = ({T}, {B})." + ) + logging.warning( + f"Using (indices = {indices_tensor.size()}, offsets = {offsets_tensor.size()}) based " + f"on tables: {emb_tables}" + ) + else: + average_L = int((offsets_tensor[-1] - offsets_tensor[0]) / B) + assert (np.prod(offsets_tensor.size()) - 1) == np.prod((T, B)), ( + f"Data file (indices = {indices_tensor.size()}, " + f"offsets = {offsets_tensor.size()}, lengths = {lengths_tensor.size()}) " + f"does not conform to inputs (T, B) = ({T}, {B})." + ) + + assert ( + L == average_L + ), f"Requested L does not align with provided data file ({L} vs. 
{average_L})" + assert E > max(indices_tensor), ( + f"Number of embeddings is not enough to support maximum index " + f"provided by data file {E} vs. {max(indices_tensor)}" + ) + + weights_tensor = ( + None + if not weighted + else torch.randn(indices_tensor.size(), device=get_device()) + ) + rs = [] + for _ in range(iters): + rs.append( + ( + indices_tensor.to(get_device()), + offsets_tensor.to(get_device()), + weights_tensor, + ) + ) + return rs + + if alpha <= 1.0: + all_indices = torch.randint( + low=0, + high=E, + size=(iters, T, B, L), + device=get_device(), + dtype=torch.int32, + ) + # each bag is usually sorted + (all_indices, _) = torch.sort(all_indices) + all_indices = all_indices.reshape(iters, T, B * L) + else: + assert E >= L, "num-embeddings must be greater than equal to bag-size" + # oversample and then remove duplicates to obtain sampling without + # replacement + all_indices = (np.random.zipf(a=alpha, size=(iters, T, B, 3 * L)) - 1) % E + all_indices = torch.ops.fbgemm.bottom_unique_k_per_row( + torch.as_tensor(all_indices), L + ) + rng = default_rng() + permutation = torch.as_tensor( + rng.choice(E, size=all_indices.max().item() + 1, replace=False) + ) + all_indices = permutation.gather(0, all_indices.flatten()) + all_indices = all_indices.to(get_device()).int().reshape(iters, T, B * L) + for it in range(iters - 1): + for t in range(T): + reused_indices = torch.randperm(B * L, device=get_device())[ + : int(B * L * reuse) + ] + all_indices[it + 1, t, reused_indices] = all_indices[it, t, reused_indices] + + rs = [] + for it in range(iters): + weights_tensor = ( + None if not weighted else torch.randn(T * B * L, device=get_device()) + ) + rs.append( + get_table_batched_offsets_from_dense(all_indices[it].view(T, B, L)) + + (weights_tensor,) + ) + return rs + + +def benchmark_requests( + requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], + func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor], + flush_gpu_cache_size_mb: int = 0, + check_median: bool = False, + num_warmups: int = 0, +) -> float: + times = [] + + if num_warmups > 0: + indices, offsets, weights = requests[0] + for _ in range(num_warmups): + func(indices, offsets, weights) + + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + for (indices, offsets, weights) in requests: + start_time = time.time() + if torch.cuda.is_available(): + if flush_gpu_cache_size_mb: + _ = torch.rand( + flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float + ) + torch.cuda.synchronize() + start_event.record() + func(indices, offsets, weights) + if torch.cuda.is_available(): + end_event.record() + torch.cuda.synchronize() + it_time = start_event.elapsed_time(end_event) * 1.0e-3 + times.append(it_time) + else: + it_time = time.time() - start_time + times.append(it_time) + avg_time = sum(times) / len(requests) + median_time = statistics.median(times) + return median_time if check_median else avg_time + + +def benchmark_requests_refer( + requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], + T: int, + B: int, + L: int, + E: int, + D: int, + pooling_mode: str, + weighted: bool, + flush_gpu_cache_size_mb: int = 0, + check_median: bool = False, +) -> float: + do_pooling = pooling_mode in ["sum", "mean"] + if do_pooling: + nn_embedding_list = [ + torch.nn.EmbeddingBag(E, D, mode=pooling_mode, sparse=True).cuda() + ] * T + else: + nn_embedding_list = [torch.nn.Embedding(E, D, 
sparse=True).cuda()] * T + + times = [] + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + for (indices, _, weights) in requests: + indices_list = indices.view(T, B, L).split(1) + + if weighted: + assert weights is not None + weights_list = weights.view(T, B, L).split(1) + + start_time = time.time() + if torch.cuda.is_available(): + if flush_gpu_cache_size_mb: + _ = torch.rand( + flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float + ) + torch.cuda.synchronize() + start_event.record() + + nn_embedding_output = ( + [ + b_indices(nn_embedding, x, use_cpu=False, do_pooling=do_pooling) + for (nn_embedding, x) in zip(nn_embedding_list, indices_list) + ] + if not weighted + else [ + b_indices( + nn_embedding, + x, + per_sample_weights=xw.view(-1), + use_cpu=False, + do_pooling=do_pooling, + ) + for (nn_embedding, x, xw) in zip( + nn_embedding_list, + indices_list, + # pyre-fixme[61]: `weights_list` is undefined, or not always + # defined. + weights_list, + ) + ] + ) + if do_pooling: + final_output = torch.cat( + [f.view(B, -1) for f in nn_embedding_output], dim=1 + ) + else: + final_output = torch.cat(nn_embedding_output, dim=0).view(-1, D) + + if torch.cuda.is_available(): + end_event.record() + torch.cuda.synchronize() + it_time = start_event.elapsed_time(end_event) * 1.0e-3 + times.append(it_time) + else: + it_time = time.time() - start_time + times.append(it_time) + avg_time = sum(times) / len(requests) + median_time = statistics.median(times) + return median_time if check_median else avg_time + + +def benchmark_pipelined_requests( + requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], + func1: Callable[[Tensor, Tensor, Optional[Tensor]], None], + func2: Callable[[Tensor, Tensor, Optional[Tensor]], None], + flush_gpu_cache_size_mb: int = 0, + check_median: bool = False, +) -> Tuple[float, float]: + torch.cuda.synchronize() + start_events = [ + (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) + for _ in requests + ] + end_events = [ + (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) + for _ in requests + ] + for ((indices, offsets, indices_weights), start_event, end_event) in zip( + requests, start_events, end_events + ): + if flush_gpu_cache_size_mb: + _ = torch.rand( + flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float + ) + torch.cuda.synchronize() + start_event[0].record() + func1(indices, offsets, indices_weights) + end_event[0].record() + start_event[1].record() + func2(indices, offsets, indices_weights) + end_event[1].record() + torch.cuda.synchronize() + avg_time = ( + sum( + start_event[0].elapsed_time(end_event[0]) * 1.0e-3 + for start_event, end_event in zip(start_events, end_events) + ) + / len(requests), + sum( + start_event[1].elapsed_time(end_event[1]) * 1.0e-3 + for start_event, end_event in zip(start_events, end_events) + ) + / len(requests), + ) + median_time = ( + statistics.median( + start_event[0].elapsed_time(end_event[0]) * 1.0e-3 + for start_event, end_event in zip(start_events, end_events) + ), + statistics.median( + start_event[1].elapsed_time(end_event[1]) * 1.0e-3 + for start_event, end_event in zip(start_events, end_events) + ), + ) + return median_time if check_median else avg_time diff --git a/fbgemm_gpu/bench/jagged_tensor_benchmark.py b/fbgemm_gpu/bench/jagged_tensor_benchmark.py index 95075597f..64e7177f3 100644 --- 
a/fbgemm_gpu/bench/jagged_tensor_benchmark.py +++ b/fbgemm_gpu/bench/jagged_tensor_benchmark.py @@ -50,6 +50,7 @@ def device( else torch.float32 ) + # pyre-fixme[6]: For 1st param expected `int` but got `Union[bool, float, int]`. values_2d = torch.rand(total_lengths, embedding_dim, dtype=dtype) if torch.cuda.is_available(): @@ -57,8 +58,7 @@ def device( values_2d = values_2d.cuda() time, output = benchmark_torch_function( - torch.ops.fbgemm.jagged_2d_to_dense, - (values_2d, offsets, max_len), + torch.ops.fbgemm.jagged_2d_to_dense, (values_2d, offsets, max_len), iters=1000 ) num_bytes = ( @@ -68,6 +68,19 @@ def device( ) logging.info(f"jagged_2d_to_dense {time} sec {num_bytes / time / 1e9} GB/s") + total_L = values_2d.size(0) + time, jagged_output = benchmark_torch_function( + torch.ops.fbgemm.dense_to_jagged, (output, [offsets], total_L), iters=1000 + ) + + # Recompute num_bytes to exclude the entire dense tensor + num_bytes = offsets.numel() * offsets.element_size() + 2 * ( + values_2d.numel() * values_2d.element_size() + ) + logging.info(f"dense_to_jagged (2d) {time} sec {num_bytes / time / 1e9} GB/s") + + # pyre-fixme[6]: For 1st param expected `Union[List[int], Size, + # typing.Tuple[int, ...]]` but got `Union[bool, float, int]`. values_1d = torch.rand(total_lengths) if torch.cuda.is_available(): values_1d = values_1d.cuda() @@ -77,6 +90,7 @@ def device( values_1d, offsets, max_len, padding_value=0 ), (), + iters=1000, ) num_bytes = ( @@ -86,6 +100,18 @@ def device( ) logging.info(f"jagged_1d_to_dense {time} sec {num_bytes / time / 1e9} GB/s") + total_L = values_1d.size(0) + output_1d = torch.unsqueeze(output, -1) + time, jagged_output = benchmark_torch_function( + torch.ops.fbgemm.dense_to_jagged, (output_1d, [offsets], total_L), iters=1000 + ) + + # Recompute num_bytes to exclude the entire dense tensor + num_bytes = offsets.numel() * offsets.element_size() + 2 * ( + values_1d.numel() * values_1d.element_size() + ) + logging.info(f"dense_to_jagged (1d) {time} sec {num_bytes / time / 1e9} GB/s") + if __name__ == "__main__": cli() diff --git a/fbgemm_gpu/bench/merge_embeddings_benchmark.py b/fbgemm_gpu/bench/merge_embeddings_benchmark.py index 0a820da14..8b56af31a 100644 --- a/fbgemm_gpu/bench/merge_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/merge_embeddings_benchmark.py @@ -9,7 +9,7 @@ import logging import signal -from typing import Tuple, List +from typing import List, Tuple import click import fbgemm_gpu @@ -26,14 +26,19 @@ else: from fbgemm_gpu.bench.bench_utils import benchmark_torch_function + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings") + torch.ops.load_library( + "//deeplearning/fbgemm/fbgemm_gpu:merge_pooled_embeddings_cpu" + ) + from fbgemm_gpu.split_table_batched_embeddings_ops import ( - SparseType, BoundsCheckMode, - IntNBitTableBatchedEmbeddingBagsCodegen, EmbeddingLocation, + IntNBitTableBatchedEmbeddingBagsCodegen, + SparseType, ) -from torch.profiler import ProfilerActivity, profile +from torch.profiler import profile, ProfilerActivity def get_gpu_device(gpu_num) -> torch.device: diff --git a/fbgemm_gpu/bench/quantize_ops_benchmark.py b/fbgemm_gpu/bench/quantize_ops_benchmark.py index f15e1417c..e4f01ee7f 100644 --- a/fbgemm_gpu/bench/quantize_ops_benchmark.py +++ b/fbgemm_gpu/bench/quantize_ops_benchmark.py @@ -33,17 +33,7 @@ def cli() -> None: pass -@cli.command() -@click.option("--flush-gpu-cache-size-mb", default=0) -@click.option("--iters", default=100) -@click.option("--warmup-runs", default=2) -@settings(max_examples=10,
deadline=None) -# pyre-ignore -@given( - num_columns=st.sampled_from([2 ** n for n in range(4, 10)]), - num_rows=st.sampled_from([2 ** n for n in range(4, 10)]), -) -def bench( +def bench_impl( flush_gpu_cache_size_mb: int, iters: int, num_columns: int, @@ -57,11 +47,17 @@ def bench( "int2_quant": 0.0, "fp8_143_quant": 0.0, "fp8_152_quant": 0.0, + "fp16_quant": 0.0, + "bf16_quant_fbgemm": 0.0, + "bf16_quant_pytorch": 0.0, "int8_dequant": 0.0, "int4_dequant": 0.0, "int2_dequant": 0.0, "fp8_143_dequant": 0.0, "fp8_152_dequant": 0.0, + "fp16_dequant": 0.0, + "bf16_dequant_fbgemm": 0.0, + "bf16_dequant_pytorch": 0.0, } benchmark = functools.partial( @@ -72,6 +68,8 @@ def bench( ) input_data = torch.rand(num_rows, num_columns).float() + if torch.cuda.is_available(): + input_data = input_data.cuda() quant_data_8bit = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized(input_data) quant_data_4bit = torch.ops.fbgemm.FloatToFusedNBitRowwiseQuantizedSBHalf( @@ -86,9 +84,11 @@ def bench( quant_data_fp8_152 = torch.ops.fbgemm.FloatToHFP8Quantized( input_data, 5, 30, (2 - 2 ** (-2)) ) - - if torch.cuda.is_available(): - input_data = input_data.cuda() + quant_data_fp16 = input_data.half() + quant_data_bf16_fbgemm = torch.ops.fbgemm.FloatToBfloat16Quantized( + input_data.contiguous() + ) + quant_data_bf16_pytorch = input_data.bfloat16().view(torch.half) average_time["int8_quant"], _ = benchmark( torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized, @@ -98,7 +98,6 @@ def bench( torch.ops.fbgemm.FloatToFusedNBitRowwiseQuantizedSBHalf, (input_data, 4), ) - average_time["int2_quant"], _ = benchmark( torch.ops.fbgemm.FloatToFusedNBitRowwiseQuantizedSBHalf, (input_data, 2), @@ -111,12 +110,23 @@ def bench( torch.ops.fbgemm.FloatToHFP8Quantized, (input_data, 5, 30, (2 - 2 ** (-2))), ) + average_time["fp16_quant"], _ = benchmark( + lambda tensor: tensor.half(), + (input_data,), + ) + average_time["bf16_quant_fbgemm"], _ = benchmark( + torch.ops.fbgemm.FloatToBfloat16Quantized, + (input_data,), + ) + average_time["bf16_quant_pytorch"], _ = benchmark( + lambda tensor: tensor.bfloat16().view(torch.half), + (input_data,), + ) average_time["int8_dequant"], _ = benchmark( torch.ops.fbgemm.Fused8BitRowwiseQuantizedToFloat, (quant_data_8bit,), ) - average_time["int4_dequant"], _ = benchmark( torch.ops.fbgemm.FusedNBitRowwiseQuantizedSBHalfToFloat, (quant_data_4bit, 4), @@ -133,11 +143,75 @@ def bench( torch.ops.fbgemm.HFP8QuantizedToFloat, (quant_data_fp8_152, 5, 30), ) + average_time["fp16_dequant"], _ = benchmark( + lambda tensor: tensor.float(), + (quant_data_fp16,), + ) + average_time["bf16_dequant_fbgemm"], _ = benchmark( + torch.ops.fbgemm.Bfloat16QuantizedToFloat, + (quant_data_bf16_fbgemm,), + ) + average_time["bf16_dequant_pytorch"], _ = benchmark( + lambda tensor: tensor.view(torch.bfloat16).float(), + (quant_data_bf16_pytorch,), + ) + logging.info(f"-------------- ncols={num_columns}, nrows={num_rows}-------------") for k, t_time in average_time.items(): logging.info(f"{k} time per iter: {t_time * 1.0e6:.0f}us") +@settings(max_examples=10, deadline=None) +# pyre-ignore +@given( + num_columns=st.sampled_from([2**n for n in range(4, 10)]), + num_rows=st.sampled_from([2**n for n in range(4, 10)]), +) +def bench_spectrum( + flush_gpu_cache_size_mb: int, + iters: int, + num_columns: int, + num_rows: int, + warmup_runs: int, +) -> None: + bench_impl( + flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + iters=iters, + num_columns=num_columns, + num_rows=num_rows, + warmup_runs=warmup_runs, + ) + + +@cli.command() 
+@click.option("--flush-gpu-cache-size-mb", default=0) +@click.option("--iters", default=100) +@click.option("--num-columns", default=-1) +@click.option("--num-rows", default=-1) +@click.option("--warmup-runs", default=2) +def bench( + flush_gpu_cache_size_mb: int, + iters: int, + num_columns: int, + num_rows: int, + warmup_runs: int, +) -> None: + if num_columns == -1 or num_rows == -1: + bench_spectrum( + flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + iters=iters, + warmup_runs=warmup_runs, + ) + else: + bench_impl( + flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, + iters=iters, + num_columns=num_columns, + num_rows=num_rows, + warmup_runs=warmup_runs, + ) + + @cli.command() @click.option("--flush-gpu-cache-size-mb", default=0) @click.option("--iters", default=100) diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index 8ab67381e..9d4e53a8b 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -3,11 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import functools import logging import random import click import fbgemm_gpu +import numpy as np import torch logging.basicConfig(level=logging.DEBUG) @@ -69,5 +71,158 @@ def device( logging.info(f"expand_into_jagged_permute {time} sec {num_bytes / time / 1e9} GB/s") +@cli.command() +@click.option("--row-size", default=25600) +@click.option("--batch-size", default=4096) +@click.option("--unique-batch-size", default=1024) +@click.option("--input-precision", type=str, default="fp32") +def batch_reuse_index_select_device( + row_size: int, batch_size: int, unique_batch_size: int, input_precision: str +) -> None: + # A function for generating indices in batch_reuse + # pyre-fixme[11]: Annotation `array` is not defined as a type. + def gen_inverse_index(curr_size: int, final_size: int) -> np.array: + inverse_index = list(range(curr_size)) + np_arr = np.array(inverse_index) + for _ in range(final_size - curr_size): + inverse_index.append(np.random.randint(0, curr_size)) + np_arr = np.array(inverse_index) + np.random.shuffle(np_arr) + return np_arr + + dtype = torch.float + if input_precision == "fp32": + dtype = torch.float + elif input_precision == "fp16": + dtype = torch.half + else: + raise RuntimeError(f"Does not support data type {input_precision}") + + # pyre-fixme[16]: Module `cuda` has no attribute `IntTensor`. 
+ indices = torch.cuda.IntTensor(gen_inverse_index(unique_batch_size, batch_size)) + + input = torch.rand(unique_batch_size, row_size, dtype=dtype, device="cuda") + input.requires_grad = True + num_bytes = 2 * batch_size * row_size * input.element_size() + time, output = benchmark_torch_function( + torch.ops.fbgemm.index_select_dim0, (input, indices, 0, unique_batch_size) + ) + logging.info( + f"index_select_dim0 forward: {dtype}, {num_bytes} bytes read/write, {time * 1e3} ms, {num_bytes / time / 1e9} GB/s" + ) + + grad = torch.rand_like(output, dtype=dtype, device="cuda") + num_bytes = (input.numel() + output.numel()) * input.element_size() + time, _ = benchmark_torch_function( + functools.partial(output.backward, retain_graph=True), (grad,) + ) + logging.info( + f"index_select_dim0 backward: {dtype}, {num_bytes} bytes read/write, {time * 1e3} ms, {num_bytes / time / 1e9} GB/s" + ) + + +@cli.command() +@click.option("--max-seq-length", default=500) +@click.option("--batch-size", default=4096) +@click.option("--num-cols", default=256) +@click.option("--num-jagged-tensor-rows", default=4096) +@click.option("--num-zero-padding", default=1024) +@click.option("--index-dtype", type=click.Choice(["int", "long"]), default="int") +@click.option( + "--jagged-tensor-dtype", type=click.Choice(["float", "half"]), default="float" +) +def jagged_index_select_2d_bench( + max_seq_length: int, + batch_size: int, + num_cols: int, + num_jagged_tensor_rows: int, + num_zero_padding: int, + index_dtype: str, + jagged_tensor_dtype: str, +) -> None: + def jagged_index_select_2d_ref( + values: torch.Tensor, lengths: torch.Tensor, inverse_lookup: torch.Tensor + ) -> torch.Tensor: + offsets = torch.ops.fbgemm.asynchronous_exclusive_cumsum(lengths) + end_offsets = offsets + lengths + full_start_offset = torch.index_select(offsets, 0, inverse_lookup) + full_end_offset = torch.index_select(end_offsets, 0, inverse_lookup) + index_ranges = torch.stack( + (full_start_offset, full_end_offset), dim=0 + ).transpose(0, 1) + + to_be_merged_tensors = [] + for row in index_ranges: + to_be_merged_tensors.append(torch.arange(row[0], row[1], device="cuda")) + all_indices = torch.cat(to_be_merged_tensors, dim=0) + new_embeddings = torch.index_select(values, 0, all_indices) + return new_embeddings + + index_t = {"int": torch.int, "long": torch.long}[index_dtype] + scalar_t = {"float": torch.float, "half": torch.half}[jagged_tensor_dtype] + + lengths = torch.randint( + low=0, + high=max_seq_length, + size=(num_jagged_tensor_rows,), + dtype=index_t, + device="cuda", + ) + indices, _ = torch.sort( + torch.randint( + low=0, + high=num_jagged_tensor_rows, + size=(batch_size,), + dtype=index_t, + device="cuda", + ) + ) + values = torch.rand( + int(lengths.sum().item()), num_cols, dtype=scalar_t, device="cuda" + ) + values.requires_grad = True + + indices[batch_size - num_zero_padding :] = 0 + + time, (output, _) = benchmark_torch_function( + torch.ops.fbgemm.jagged_index_select, + (values, lengths, indices), + num_warmups=10, + iters=100, + ) + time_ref, output_ref = benchmark_torch_function( + jagged_index_select_2d_ref, + (values, lengths, indices), + num_warmups=10, + iters=100, + ) + logging.info( + f"jagged_index_select_2d_bench " + f"(max_seq_length={max_seq_length}, " + f"batch_size={batch_size}, " + f"num_cols={num_cols}, " + f"num_jagged_tensor_rows={num_jagged_tensor_rows}, " + f"num_zero_padding={num_zero_padding}, " + f"index_dtype={index_dtype}, " + f"jagged_tensor_dtype={jagged_tensor_dtype})" + ) + logging.info(f"forward: 
fbgemm {time * 1e3:.3f} ms, ref {time_ref * 1e3:.3f} ms") + + grad = torch.rand_like(output) + time, _ = benchmark_torch_function( + functools.partial(output.backward, retain_graph=True), + (grad,), + num_warmups=10, + iters=100, + ) + time_ref, _ = benchmark_torch_function( + functools.partial(output_ref.backward, retain_graph=True), + (grad,), + num_warmups=10, + iters=100, + ) + logging.info(f"backward: fbgemm {time * 1e3:.3f} ms, ref {time_ref * 1e3:.3f} ms") + + if __name__ == "__main__": cli() diff --git a/fbgemm_gpu/bench/split_embeddings_cache_benchmark.py b/fbgemm_gpu/bench/split_embeddings_cache_benchmark.py index 8beb07053..7f7f9a3c1 100644 --- a/fbgemm_gpu/bench/split_embeddings_cache_benchmark.py +++ b/fbgemm_gpu/bench/split_embeddings_cache_benchmark.py @@ -17,7 +17,7 @@ EmbeddingLocation, IntNBitTableBatchedEmbeddingBagsCodegen, ) -from torch import Tensor, nn +from torch import nn, Tensor logging.basicConfig(level=logging.DEBUG) diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index eb5f8845d..7b550c21f 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -5,12 +5,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import itertools + import logging import math import random import statistics -import time from typing import Callable, List, Optional, Tuple import click @@ -32,14 +31,14 @@ ComputeDevice, DenseTableBatchedEmbeddingBagsCodegen, EmbeddingLocation, - OptimType, - SparseType, - SplitTableBatchedEmbeddingBagsCodegen, IntNBitTableBatchedEmbeddingBagsCodegen, + OptimType, PoolingMode, + RecordCacheMetrics, rounded_row_size_in_bytes, + SparseType, + SplitTableBatchedEmbeddingBagsCodegen, ) -from numpy.random import default_rng from torch import Tensor # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`. 
@@ -47,364 +46,28 @@ if open_source: # pyre-ignore[21] - from bench_utils import benchmark_torch_function -else: - from fbgemm_gpu.bench.bench_utils import benchmark_torch_function - - -logging.basicConfig(level=logging.DEBUG) - - -def round_up(a: int, b: int) -> int: - return int((a + b - 1) // b) * b - - -def get_device() -> torch.device: - return ( - torch.cuda.current_device() - if torch.cuda.is_available() - else torch.device("cpu") - ) - - -# Merged indices with shape (T, B, L) -> (flattened indices with shape -# (T * B * L), offsets with shape (T * B + 1)) -def get_table_batched_offsets_from_dense( - merged_indices: Tensor, -) -> Tuple[Tensor, Tensor]: - (T, B, L) = merged_indices.size() - lengths = np.ones((T, B)) * L - flat_lengths = lengths.flatten() - return ( - merged_indices.long().contiguous().view(-1).to(get_device()), - torch.tensor(([0] + np.cumsum(flat_lengths).tolist())).long().to(get_device()), + from bench_utils import ( + benchmark_pipelined_requests, + benchmark_requests, + benchmark_requests_refer, + benchmark_torch_function, + generate_requests, + get_device, + round_up, ) - - -def get_offsets_from_dense(indices: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - (B, L) = indices.size() - return ( - indices.contiguous().view(-1), - torch.tensor( - np.cumsum(np.asarray([0] + [L for _ in range(B)])[:-1]).astype(np.int64) - ), +else: + from fbgemm_gpu.bench.bench_utils import ( + benchmark_pipelined_requests, + benchmark_requests, + benchmark_requests_refer, + benchmark_torch_function, + generate_requests, + get_device, + round_up, ) -def b_indices( - b: Callable[..., torch.Tensor], - x: torch.Tensor, - per_sample_weights: Optional[torch.Tensor] = None, - use_cpu: bool = False, - do_pooling: bool = True, -) -> torch.Tensor: - (indices, offsets) = get_offsets_from_dense(x) - if do_pooling: - return b( - indices.cuda(), - offsets.cuda(), - per_sample_weights=per_sample_weights, - ) - else: - return b(indices.cuda()) - - -def generate_requests( - iters: int, - B: int, - T: int, - L: int, - E: int, - # inter-batch indices reuse rate - reuse: float = 0.0, - # alpha <= 1.0: use uniform distribution - # alpha > 1.0: use zipf distribution - alpha: float = 1.0, - weights_precision: SparseType = SparseType.FP32, - weighted: bool = False, - requests_data_file: Optional[str] = None, - # Comma-separated list of table numbers - tables: Optional[str] = None, -) -> List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]]: - if requests_data_file is not None: - indices_tensor, offsets_tensor, lengths_tensor = torch.load(requests_data_file) - - average_L = 0 - if tables is not None: - emb_tables = tuple(int(x) for x in tables.split(",")) - indices = torch.zeros(0, dtype=indices_tensor.dtype) - offsets = torch.zeros(1, dtype=offsets_tensor.dtype) - total_L = 0 - for t in emb_tables: - t_offsets = offsets_tensor[B * t : B * (t + 1) + 1] - total_L += t_offsets[-1] - t_offsets[0] - indices = torch.cat( - (indices, indices_tensor[t_offsets[0] : t_offsets[-1]]) - ) - offsets = torch.cat( - ( - offsets, - t_offsets[1:] - t_offsets[0] + offsets[-1], - ) - ) - indices_tensor = indices - offsets_tensor = offsets - average_L = int(total_L / B) - - assert np.prod(offsets_tensor.size()) - 1 == np.prod((T, B)), ( - f"Requested tables: {emb_tables} " - f"does not conform to inputs (T, B) = ({T}, {B})." 
- ) - logging.warning( - f"Using (indices = {indices_tensor.size()}, offsets = {offsets_tensor.size()}) based " - f"on tables: {emb_tables}" - ) - else: - average_L = int((offsets_tensor[-1] - offsets_tensor[0]) / B) - assert (np.prod(offsets_tensor.size()) - 1) == np.prod((T, B)), ( - f"Data file (indices = {indices_tensor.size()}, " - f"offsets = {offsets_tensor.size()}, lengths = {lengths_tensor.size()}) " - f"does not conform to inputs (T, B) = ({T}, {B})." - ) - - assert ( - L == average_L - ), f"Requested L does not align with provided data file ({L} vs. {average_L})" - assert E > max(indices_tensor), ( - f"Number of embeddings is not enough to support maximum index " - f"provided by data file {E} vs. {max(indices_tensor)}" - ) - - weights_tensor = ( - None - if not weighted - else torch.randn(indices_tensor.size(), device=get_device()) - ) - rs = [] - for _ in range(iters): - rs.append( - ( - indices_tensor.to(get_device()), - offsets_tensor.to(get_device()), - weights_tensor, - ) - ) - return rs - - if alpha <= 1.0: - all_indices = torch.randint( - low=0, - high=E, - size=(iters, T, B, L), - device=get_device(), - dtype=torch.int32, - ) - # each bag is usually sorted - (all_indices, _) = torch.sort(all_indices) - all_indices = all_indices.reshape(iters, T, B * L) - else: - assert E >= L, "num-embeddings must be greater than equal to bag-size" - # oversample and then remove duplicates to obtain sampling without - # replacement - all_indices = (np.random.zipf(a=alpha, size=(iters, T, B, 3 * L)) - 1) % E - for index_tuple in itertools.product(range(iters), range(T), range(B)): - # sample without replacement from - # https://stats.stackexchange.com/questions/20590/how-do-i-sample-without-replacement-using-a-sampling-with-replacement-function - r = set() - for x in all_indices[index_tuple]: - if x not in r: - r.add(x) - if len(r) == L: - break - assert (len(r)) == L, "too skewed distribution (alpha too big)" - all_indices[index_tuple][:L] = list(r) - # shuffle indices so we don't have unintended spatial locality - all_indices = torch.as_tensor(all_indices[:, :, :, :L]) - rng = default_rng() - permutation = torch.as_tensor( - rng.choice(E, size=all_indices.max().item() + 1, replace=False) - ) - all_indices = permutation.gather(0, all_indices.flatten()) - all_indices = all_indices.to(get_device()).int().reshape(iters, T, B * L) - for it in range(iters - 1): - for t in range(T): - reused_indices = torch.randperm(B * L, device=get_device())[ - : int(B * L * reuse) - ] - all_indices[it + 1, t, reused_indices] = all_indices[it, t, reused_indices] - - rs = [] - for it in range(iters): - weights_tensor = ( - None if not weighted else torch.randn(T * B * L, device=get_device()) - ) - rs.append( - get_table_batched_offsets_from_dense(all_indices[it].view(T, B, L)) - + (weights_tensor,) - ) - return rs - - -def benchmark_requests( - requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], - func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor], - flush_gpu_cache_size_mb: int = 0, - check_median: bool = False, -) -> float: - times = [] - if torch.cuda.is_available(): - torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - for (indices, offsets, weights) in requests: - start_time = time.time() - if torch.cuda.is_available(): - if flush_gpu_cache_size_mb: - _ = torch.rand( - flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float - ) - torch.cuda.synchronize() - start_event.record() - 
func(indices, offsets, weights) - if torch.cuda.is_available(): - end_event.record() - torch.cuda.synchronize() - it_time = start_event.elapsed_time(end_event) * 1.0e-3 - times.append(it_time) - else: - it_time = time.time() - start_time - times.append(it_time) - avg_time = sum(times) / len(requests) - median_time = statistics.median(times) - return median_time if check_median else avg_time - - -def benchmark_requests_refer( - requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], - T: int, - B: int, - L: int, - E: int, - D: int, - pooling_mode: str, - weighted: bool, - flush_gpu_cache_size_mb: int = 0, - check_median: bool = False, -) -> float: - do_pooling = pooling_mode in ["sum", "mean"] - if do_pooling: - nn_embedding_list = [ - torch.nn.EmbeddingBag(E, D, mode=pooling_mode, sparse=True).cuda() - ] * T - else: - nn_embedding_list = [torch.nn.Embedding(E, D, sparse=True).cuda()] * T - - times = [] - if torch.cuda.is_available(): - torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - for (indices, _, weights) in requests: - indices_list = indices.view(T, B, L).split(1) - - if weighted: - assert weights is not None - weights_list = weights.view(T, B, L).split(1) - - start_time = time.time() - if torch.cuda.is_available(): - if flush_gpu_cache_size_mb: - _ = torch.rand( - flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float - ) - torch.cuda.synchronize() - start_event.record() - - nn_embedding_output = ( - [ - b_indices(nn_embedding, x, use_cpu=False, do_pooling=do_pooling) - for (nn_embedding, x) in zip(nn_embedding_list, indices_list) - ] - if not weighted - else [ - b_indices( - nn_embedding, - x, - per_sample_weights=xw.view(-1), - use_cpu=False, - do_pooling=do_pooling, - ) - for (nn_embedding, x, xw) in zip( - nn_embedding_list, - indices_list, - # pyre-fixme[61]: `weights_list` is undefined, or not always - # defined. 
- weights_list, - ) - ] - ) - if do_pooling: - final_output = torch.cat( - [f.view(B, -1) for f in nn_embedding_output], dim=1 - ) - else: - final_output = torch.cat(nn_embedding_output, dim=0).view(-1, D) - - if torch.cuda.is_available(): - end_event.record() - torch.cuda.synchronize() - it_time = start_event.elapsed_time(end_event) * 1.0e-3 - times.append(it_time) - else: - it_time = time.time() - start_time - times.append(it_time) - avg_time = sum(times) / len(requests) - median_time = statistics.median(times) - return median_time if check_median else avg_time - - -def benchmark_pipelined_requests( - requests: List[Tuple[torch.IntTensor, torch.IntTensor, Optional[Tensor]]], - func1: Callable[[Tensor, Tensor, Optional[Tensor]], None], - func2: Callable[[Tensor, Tensor, Optional[Tensor]], None], - flush_gpu_cache_size_mb: int = 0, -) -> Tuple[float, float]: - torch.cuda.synchronize() - start_events = [ - (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) - for _ in requests - ] - end_events = [ - (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) - for _ in requests - ] - for ((indices, offsets, indices_weights), start_event, end_event) in zip( - requests, start_events, end_events - ): - if flush_gpu_cache_size_mb: - _ = torch.rand( - flush_gpu_cache_size_mb * 1024 * 1024 // 4, dtype=torch.float - ) - torch.cuda.synchronize() - start_event[0].record() - func1(indices, offsets, indices_weights) - end_event[0].record() - start_event[1].record() - func2(indices, offsets, indices_weights) - end_event[1].record() - torch.cuda.synchronize() - return ( - sum( - start_event[0].elapsed_time(end_event[0]) * 1.0e-3 - for start_event, end_event in zip(start_events, end_events) - ) - / len(requests), - sum( - start_event[1].elapsed_time(end_event[1]) * 1.0e-3 - for start_event, end_event in zip(start_events, end_events) - ) - / len(requests), - ) +logging.basicConfig(level=logging.DEBUG) @click.group() @@ -428,7 +91,9 @@ def cli() -> None: @click.option("--reuse", default=0.0) @click.option("--row-wise/--no-row-wise", default=True) @click.option("--weighted", is_flag=True, default=False) +@click.option("--pooling", type=str, default="sum") @click.option("--weighted-num-requires-grad", type=int, default=None) +@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.NONE.value) @click.option("--flush-gpu-cache-size-mb", default=0) @click.option("--dense", is_flag=True, default=False) @click.option("--output-dtype", type=SparseType, default=SparseType.FP32) @@ -449,7 +114,9 @@ def device( # noqa C901 reuse: float, row_wise: bool, weighted: bool, + pooling: str, weighted_num_requires_grad: Optional[int], + bounds_check_mode: int, flush_gpu_cache_size_mb: int, dense: bool, output_dtype: SparseType, @@ -496,6 +163,17 @@ def device( # noqa C901 else: managed_option = EmbeddingLocation.MANAGED + if pooling is None or pooling == "sum": + pooling = "sum" + pooling_mode = PoolingMode.SUM + do_pooling = True + elif pooling == "mean": + pooling_mode = PoolingMode.MEAN + do_pooling = True + else: # "none" + pooling_mode = PoolingMode.NONE + do_pooling = False + if dense: emb = DenseTableBatchedEmbeddingBagsCodegen( [ @@ -505,6 +183,7 @@ def device( # noqa C901 ) for d in Ds ], + pooling_mode=pooling_mode, use_cpu=not torch.cuda.is_available(), ) else: @@ -526,6 +205,8 @@ def device( # noqa C901 weights_precision=weights_precision, stochastic_rounding=stoc, output_dtype=output_dtype, + pooling_mode=pooling_mode, + 
bounds_check_mode=BoundsCheckMode(bounds_check_mode), ) emb = emb.to(get_device()) @@ -534,6 +215,18 @@ def device( # noqa C901 nparams = sum(w.numel() for w in emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 + output_size_multiplier = output_dtype.bit_rate() / 8.0 + if do_pooling: + read_write_bytes = ( + output_size_multiplier * B * sum(Ds) + + param_size_multiplier * B * sum(Ds) * L + ) + else: + read_write_bytes = ( + output_size_multiplier * B * sum(Ds) * L + + param_size_multiplier * B * sum(Ds) * L + ) + logging.info( f"Embedding parameters: {nparams / 1.0e9: .2f} GParam, " f"{nparams * param_size_multiplier / 1.0e9: .2f} GB" @@ -570,7 +263,7 @@ def device( # noqa C901 logging.info( f"Forward, B: {B}, " f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, " - f"BW: {param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 + f"BW: {read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 f"T: {time_per_iter * 1.0e6:.0f}us" ) @@ -578,7 +271,10 @@ def device( # noqa C901 # backward bench not representative return - grad_output = torch.randn(B, sum(Ds)).to(get_device()) + if do_pooling: + grad_output = torch.randn(B, sum(Ds)).to(get_device()) + else: + grad_output = torch.randn(B * T * L, D).to(get_device()) # backward time_per_iter = benchmark_requests( requests, @@ -592,7 +288,7 @@ def device( # noqa C901 ) logging.info( f"ForwardBackward, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, " - f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f} GB/s, " + f"BW: {3 * read_write_bytes / time_per_iter / 1.0e9: .2f} GB/s, " f"T: {time_per_iter * 1.0e6:.0f}us" ) @@ -1068,6 +764,8 @@ def benchmark_cpu_requests( @click.option("--requests_data_file", type=str, default=None) @click.option("--tables", type=str, default=None) @click.option("--output-dtype", type=SparseType, default=SparseType.FP16) +@click.option("--fp8-exponent-bits", type=int, default=None) +@click.option("--fp8-exponent-bias", type=int, default=None) def nbit_cpu( # noqa C901 alpha: float, bag_size: int, @@ -1087,6 +785,8 @@ def nbit_cpu( # noqa C901 requests_data_file: Optional[str], tables: Optional[str], output_dtype: SparseType, + fp8_exponent_bits: Optional[int], + fp8_exponent_bias: Optional[int], ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -1110,6 +810,8 @@ def nbit_cpu( # noqa C901 device="cpu", index_remapping=[torch.arange(E) for _ in Ds] if index_remapping else None, output_dtype=output_dtype, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cpu() emb.fill_random_weights() @@ -1177,8 +879,7 @@ def nbit_cpu( # noqa C901 @click.option("--row-wise/--no-row-wise", default=True) @click.option("--weighted", is_flag=True, default=False) @click.option("--pooling", type=str, default="sum") -@click.option("--weighted-num-requires-grad", type=int, default=None) -@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.WARNING.value) +@click.option("--bounds-check-mode", type=int, default=BoundsCheckMode.NONE.value) @click.option("--pruning-ratio", type=float, default=None) @click.option("--load-factor", default=0.75) @click.option("--use-array-for-index-remapping", is_flag=True, default=True) @@ -1191,6 +892,8 @@ def nbit_cpu( # noqa C901 @click.option("--run-reference", is_flag=True, default=False) @click.option("--requests_data_file", type=str, default=None) @click.option("--tables", type=str, default=None) +@click.option("--fp8-exponent-bits", type=int, default=None) 
+@click.option("--fp8-exponent-bias", type=int, default=None) def nbit_device( # noqa C901 alpha: float, bag_size: int, @@ -1206,7 +909,6 @@ def nbit_device( # noqa C901 row_wise: bool, weighted: bool, pooling: str, - weighted_num_requires_grad: Optional[int], bounds_check_mode: int, pruning_ratio: Optional[float], load_factor: float, @@ -1220,6 +922,8 @@ def nbit_device( # noqa C901 run_reference: bool, requests_data_file: Optional[str], tables: Optional[str], + fp8_exponent_bits: Optional[int], + fp8_exponent_bias: Optional[int], ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -1280,6 +984,8 @@ def nbit_device( # noqa C901 use_array_for_index_remapping=use_array_for_index_remapping, output_dtype=output_dtype, pooling_mode=pooling_mode, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb.fill_random_weights() @@ -1455,6 +1161,8 @@ def nbit_device( # noqa C901 @click.option("--cache-algorithm", default="lru") @click.option("--cache-load-factor", default=0.2) @click.option("--enforce-hbm", is_flag=True, default=False) +@click.option("--fp8-exponent-bits", type=int, default=None) +@click.option("--fp8-exponent-bias", type=int, default=None) def nbit_uvm( alpha: bool, bag_size: int, @@ -1476,6 +1184,8 @@ def nbit_uvm( cache_algorithm: str, cache_load_factor: float, enforce_hbm: bool, + fp8_exponent_bits: Optional[int], + fp8_exponent_bias: Optional[int], ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -1522,6 +1232,8 @@ def nbit_uvm( cache_load_factor=cache_load_factor, cache_algorithm=cache_alg, enforce_hbm=enforce_hbm, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb_uvm.fill_random_weights() @@ -1538,6 +1250,8 @@ def nbit_uvm( for d in Ds[T_uvm:] ], output_dtype=output_dtype, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb_gpu.fill_random_weights() @@ -1560,6 +1274,8 @@ def nbit_uvm( cache_load_factor=cache_load_factor, cache_algorithm=cache_alg, enforce_hbm=enforce_hbm, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb_mixed.fill_random_weights() @@ -1603,13 +1319,15 @@ def nbit_uvm( if T_gpu > 0: nparams_byte = sum(w.numel() for (w, _) in emb_mixed.split_embedding_weights()) logging.info( - f"{weights_precision} Embedding tables: {E * T + E_uvm * T_uvm} rows, {nparams_byte / param_size_multiplier / 1.0e9: .2f} GParam, " + f"{weights_precision} Embedding tables: {E * T_gpu + E_uvm * T_uvm} rows, {nparams_byte / param_size_multiplier / 1.0e9: .2f} GParam, " f"{nparams_byte / 1.0e9: .2f} GB" # IntN TBE use byte for storage ) logging.info( - f"Accessed weights per batch: {B * (T * L + T_uvm * L_uvm)} rows, " - f"{B * (T * L * sum(Ds[T_uvm:]) + T_uvm * L_uvm * sum(Ds[:T_uvm])) * param_size_multiplier / 1.0e9: .2f} GB" + f"Accessed weights per batch: {B * (T_gpu * L + T_uvm * L_uvm)} rows, " + f"{B * (L * sum(Ds[T_uvm:]) + L_uvm * sum(Ds[:T_uvm])) * param_size_multiplier / 1.0e9: .2f} GB" ) + torch.cuda.cudart().cudaProfilerStart() + torch.cuda.nvtx.range_push("uvm forward") time_per_iter = benchmark_requests( requests_uvm, @@ -1626,7 +1344,8 @@ def nbit_uvm( f"BW: {read_write_bytes_uvm / time_per_iter / 1.0e9: .2f} GB/s, " # noqa: B950 f"Time: {time_per_iter * 1.0e6:.0f}us" ) - + torch.cuda.nvtx.range_pop() + torch.cuda.cudart().cudaProfilerStop() if T_gpu > 0: requests = [] assert requests_gpu is not None @@ -1694,9 +1413,6 @@ def nbit_uvm( indices, offsets, ), - # pyre-fixme[6]: Expected `(Tensor, Tensor, 
Optional[Tensor]) -> None` for - # 3rd param but got `(indices: Any, offsets: Any, indices_weights: Any) -> - # Tensor`. lambda indices, offsets, indices_weights: emb_mixed.forward( indices, offsets, @@ -1734,6 +1450,10 @@ def nbit_uvm( @click.option("--flush-gpu-cache-size-mb", default=0) @click.option("--output-dtype", type=SparseType, default=SparseType.FP16) @click.option("--enforce-hbm", is_flag=True, default=False) +@click.option("--record-cache-miss-counter", is_flag=True, default=False) +@click.option("--record-tablewise-cache-miss", is_flag=True, default=False) +@click.option("--fp8-exponent-bits", type=int, default=None) +@click.option("--fp8-exponent-bias", type=int, default=None) def nbit_cache( # noqa C901 alpha: float, bag_size: int, @@ -1751,6 +1471,10 @@ def nbit_cache( # noqa C901 flush_gpu_cache_size_mb: int, output_dtype: SparseType, enforce_hbm: bool, + record_cache_miss_counter: bool, + record_tablewise_cache_miss: bool, + fp8_exponent_bits: Optional[int], + fp8_exponent_bias: Optional[int], ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -1782,6 +1506,8 @@ def nbit_cache( # noqa C901 ], output_dtype=output_dtype, enforce_hbm=enforce_hbm, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb_nc.fill_random_weights() @@ -1796,10 +1522,15 @@ def nbit_cache( # noqa C901 ) for d in Ds ], + record_cache_metrics=RecordCacheMetrics( + record_cache_miss_counter, record_tablewise_cache_miss + ), cache_load_factor=cache_load_factor, cache_algorithm=cache_alg, output_dtype=output_dtype, enforce_hbm=enforce_hbm, + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ).cuda() emb.fill_random_weights() @@ -1807,7 +1538,11 @@ def nbit_cache( # noqa C901 param_size_multiplier = weights_precision.bit_rate() / 8.0 output_size_multiplier = output_dtype.bit_rate() / 8.0 read_write_bytes = ( - output_size_multiplier * B * sum(Ds) + param_size_multiplier * B * sum(Ds) * L + param_size_multiplier + * B + * sum(Ds) + * L + # output_size_multiplier * B * sum(Ds) + param_size_multiplier * B * sum(Ds) * L ) logging.info( f"{weights_precision} Embedding tables: {E * T} rows, {nparams_byte / param_size_multiplier / 1.0e9: .2f} GParam, " @@ -1837,14 +1572,20 @@ def nbit_cache( # noqa C901 f"T: {time_per_iter * 1.0e6:.0f}us" ) - # exchanged_cache_lines = [100] # warm up for indices, offsets, _ in warmup_requests: emb.forward(indices.int(), offsets.int()) + # get cache miss rate (forward only) and exchanged cache lines (prefetch) cache_misses = [] exchanged_cache_lines = [] + unique_indices = [] + input_indices = [] NOT_FOUND = -1 + # reset the cache miss counters after warmup + if record_cache_miss_counter or record_tablewise_cache_miss: + emb.reset_cache_miss_counter() + for indices, offsets, _ in requests: # pyre-fixme[29]: # `Union[BoundMethod[typing.Callable(Tensor.clone)[[Named(self, @@ -1862,6 +1603,14 @@ def nbit_cache( # noqa C901 (emb.lxu_cache_locations_list.top() == NOT_FOUND).sum().item() ) emb.forward(indices, offsets) + linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices( + emb.cache_hash_size_cumsum, + indices, + offsets, + ) + unique_indices.append(len(torch.unique(linear_cache_indices, sorted=False))) + input_indices.append(len(indices)) + logging.info( f"Exchanged cache lines -- mean: {sum(exchanged_cache_lines)/len(requests): .2f}, " f"max: {max(exchanged_cache_lines)}, min: {min(exchanged_cache_lines)}" @@ -1870,20 +1619,35 @@ def nbit_cache( # noqa C901 f"Cache miss -- mean: 
{sum(cache_misses)/len(requests)}, " f"max: {max(cache_misses)}, min: {min(cache_misses)}" ) - + logging.info( + f"input_indices -- mean: {sum(input_indices)/len(requests)}, " + f"max: {max(input_indices)}, min: {min(input_indices)}" + ) + logging.info( + f"unique_indices -- mean: {sum(unique_indices)/len(requests)}, " + f"max: {max(unique_indices)}, min: {min(unique_indices)}" + ) + unique_miss_rate = [a / b for (a, b) in zip(exchanged_cache_lines, unique_indices)] + logging.info( + f"unique_miss_rate -- mean: {sum(unique_miss_rate)/len(requests)}, " + f"max: {max(unique_miss_rate)}, min: {min(unique_miss_rate)}" + ) + if record_cache_miss_counter or record_tablewise_cache_miss: + emb.print_cache_miss_counter() # benchmark prefetch - emb.reset_cache_states() + if record_cache_miss_counter or record_tablewise_cache_miss: + emb.reset_cache_states() for indices, offsets, _ in warmup_requests: emb.forward(indices, offsets) + + torch.cuda.cudart().cudaProfilerStart() + torch.cuda.nvtx.range_push("pipeline") prefetch_time, forward_time = benchmark_pipelined_requests( requests, lambda indices, offsets, indices_weights: emb.prefetch( indices, offsets, ), - # pyre-fixme[6]: Expected `(Tensor, Tensor, Optional[Tensor]) -> None` for - # 3rd param but got `(indices: Any, offsets: Any, indices_weights: Any) -> - # Tensor`. lambda indices, offsets, indices_weights: emb.forward( indices, offsets, @@ -1892,6 +1656,7 @@ def nbit_cache( # noqa C901 flush_gpu_cache_size_mb=flush_gpu_cache_size_mb, ) e2e_time = prefetch_time + forward_time + torch.cuda.nvtx.range_pop() logging.info( f"Forward(LXU) {weights_precision}, reuse: {reuse}, alpha: {alpha}, B: {B}, " @@ -1903,6 +1668,7 @@ def nbit_cache( # noqa C901 f"TfwdTime: {forward_time * 1.0e6:.0f}us, " f"{read_write_bytes / forward_time / 1.0e9: .2f} GB/s" ) + torch.cuda.cudart().cudaProfilerStop() @cli.command() @@ -1956,7 +1722,7 @@ def hashtable( # noqa C901 ) hash_table_offsets = torch.tensor([0] + np.cumsum(capacities).tolist()).long() - assert hash_table.numel() * 4 < 2 ** 32 + assert hash_table.numel() * 4 < 2**32 # initialize hash_table[:, :] = -1 torch.ops.fbgemm.pruned_hashmap_insert( @@ -2154,6 +1920,8 @@ def bounds_check_indices( # noqa C901 @click.option("--weights-precision", type=SparseType, default=SparseType.INT4) @click.option("--output-dtype", type=SparseType, default=SparseType.FP16) @click.option("--iters", type=int, default=100) +@click.option("--fp8-exponent-bits", type=int, default=None) +@click.option("--fp8-exponent-bias", type=int, default=None) def emb_inplace_update( # noqa C901 num_tables: int, embedding_dim: int, @@ -2162,6 +1930,8 @@ def emb_inplace_update( # noqa C901 weights_precision: SparseType, output_dtype: SparseType, iters: int, + fp8_exponent_bits: Optional[int], + fp8_exponent_bias: Optional[int], ) -> None: if open_source: logging.warning( @@ -2206,6 +1976,8 @@ def emb_inplace_update( # noqa C901 embedding_specs=embedding_specs, output_dtype=output_dtype, device=torch.cuda.current_device(), + fp8_exponent_bits=fp8_exponent_bits, + fp8_exponent_bias=fp8_exponent_bias, ) # Initilize the random weights for int nbit table split embedding bag op.fill_random_weights() @@ -2241,6 +2013,8 @@ def emb_inplace_update( # noqa C901 high=255, size=(update_weight_size,), dtype=torch.uint8, + # pyre-fixme[6]: For 5th param expected `Union[None, str, device]` but got + # `int`. 
device=torch.cuda.current_device(), ) @@ -2282,16 +2056,22 @@ def emb_inplace_update( # noqa C901 update_table_idx = torch.tensor( update_table_idx, + # pyre-fixme[6]: For 2nd param expected `Union[None, str, device]` but got + # `int`. device=torch.cuda.current_device(), dtype=torch.int32, ) update_row_idx = torch.tensor( update_row_idx, + # pyre-fixme[6]: For 2nd param expected `Union[None, str, device]` but got + # `int`. device=torch.cuda.current_device(), dtype=torch.int32, ) update_offsets = torch.tensor( update_offsets, + # pyre-fixme[6]: For 2nd param expected `Union[None, str, device]` but got + # `int`. device=torch.cuda.current_device(), dtype=torch.int64, ) diff --git a/fbgemm_gpu/build.sh b/fbgemm_gpu/build.sh deleted file mode 100755 index f181dd6a9..000000000 --- a/fbgemm_gpu/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -export MAX_JOBS=32 -python3.6 setup.py build develop 2>&1 | tee build.log diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake index cdc225e9d..7db59ea9f 100644 --- a/fbgemm_gpu/cmake/Hip.cmake +++ b/fbgemm_gpu/cmake/Hip.cmake @@ -5,7 +5,11 @@ IF(NOT DEFINED ENV{ROCM_PATH}) ELSE() SET(ROCM_PATH $ENV{ROCM_PATH}) ENDIF() - +if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS}) + set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) +else() + set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS}) +endif() # HIP_PATH IF(NOT DEFINED ENV{HIP_PATH}) SET(HIP_PATH ${ROCM_PATH}/hip) @@ -60,10 +64,10 @@ ELSE() ENDIF() # THRUST_PATH -IF(DEFINED ENV{THRUST_PATH}) - SET(THRUST_PATH $ENV{THRUST_PATH}) -ELSE() +IF(NOT DEFINED ENV{THRUST_PATH}) SET(THRUST_PATH ${ROCM_PATH}/include) +ELSE() + SET(THRUST_PATH $ENV{THRUST_PATH}) ENDIF() # HIPRAND_PATH @@ -94,12 +98,117 @@ set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) ADD_DEFINITIONS(-DNDEBUG) ADD_DEFINITIONS(-DUSE_ROCM) +IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) + SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) +ELSE() + SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) +ENDIF() + # Find the HIP Package find_package(HIP) IF(HIP_FOUND) set(FBGEMM_HAVE_HIP TRUE) + # Find ROCM version for checks + # ROCM 5.0 and later will have header api for version management + if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) + + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") + set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + file(WRITE ${file} "" + "#include \n" + "#include \n" + + "#ifndef ROCM_VERSION_PATCH\n" + "#define ROCM_VERSION_PATCH 0\n" + "#endif\n" + "#define STRINGIFYHELPER(x) #x\n" + "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" + "int main() {\n" + " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" + " return 0;\n" + "}\n" + ) + + try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" + RUN_OUTPUT_VARIABLE rocm_version_from_header + COMPILE_OUTPUT_VARIABLE output_var + ) + # We expect the compile to be successful if the include directory exists. 
+ if(NOT compile_result) + message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() + message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) + set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) + message("\n***** ROCm version from rocm_version.h ****\n") + endif() + + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + + if(ROCM_VERSION_DEV_MATCH) + set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) + set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) + set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3}) + set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") + math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") + endif() + + message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") + message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") + message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") + message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}") + message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}") + + math(EXPR TORCH_HIP_VERSION "(${HIP_VERSION_MAJOR} * 100) + ${HIP_VERSION_MINOR}") + message("HIP_VERSION_MAJOR: ${HIP_VERSION_MAJOR}") + message("HIP_VERSION_MINOR: ${HIP_VERSION_MINOR}") + message("TORCH_HIP_VERSION: ${TORCH_HIP_VERSION}") + + message("\n***** Library versions from dpkg *****\n") + execute_process(COMMAND dpkg -l COMMAND grep rocm-dev COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep rocm-libs COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep hsakmt-roct COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep rocr-dev COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep -w hcc COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep hip-base COMMAND awk "{print $2 \" VERSION: \" $3}") + execute_process(COMMAND dpkg -l COMMAND grep hip_hcc COMMAND awk "{print $2 \" VERSION: \" $3}") + + message("\n***** Library versions from cmake find_package *****\n") + + # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") + set(hip_DIR ${HIP_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) + set(ROCclr_DIR ${ROCM_PATH}/rocclr/lib/cmake/rocclr) + set(ROCRAND_INCLUDE ${ROCM_PATH}/include) + set(ROCM_SMI_INCLUDE ${ROCM_PATH}/rocm_smi/include) + else() + message(FATAL_ERROR "\n***** The minimal ROCm version is 5.1.0 but have ${ROCM_VERSION_DEV} installed *****\n") + endif() + + find_package(hip REQUIRED) + find_package(rocblas REQUIRED) + find_package(hipfft REQUIRED) + find_package(hiprand REQUIRED) + find_package(rocrand 
REQUIRED) + find_package(hipsparse REQUIRED) + find_package(rocprim REQUIRED) + if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) else() @@ -115,6 +224,7 @@ IF(HIP_FOUND) # list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) list(APPEND HIP_CXX_FLAGS -D__HIP_NO_BFLOAT16_CONVERSIONS__=1) list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF2_OPERATORS__=1) + list(APPEND HIP_CXX_FLAGS "${CMAKE_CXX_FLAGS}") list(APPEND HIP_CXX_FLAGS -mavx2) list(APPEND HIP_CXX_FLAGS -mf16c) list(APPEND HIP_CXX_FLAGS -mfma) @@ -126,34 +236,17 @@ IF(HIP_FOUND) list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) list(APPEND HIP_HCC_FLAGS -Wno-defaulted-function-deleted) foreach(fbgemm_rocm_arch ${FBGEMM_ROCM_ARCH}) - list(APPEND HIP_HCC_FLAGS --amdgpu-target=${fbgemm_rocm_arch}) + list(APPEND HIP_HCC_FLAGS --offload-arch=${fbgemm_rocm_arch}) endforeach() - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) - set(ROCclr_DIR ${ROCM_PATH}/rocclr/lib/cmake/rocclr) - - find_package(hip REQUIRED) - - set(ROCRAND_INCLUDE ${ROCRAND_PATH}/include) - set(ROCM_SMI_INCLUDE ${ROCM_PATH}/rocm_smi/include) - set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE}) set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $ $ ${FBGEMM_HIP_INCLUDE}) hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) + list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) + set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) + +ELSE() + message("Not able to find HIP installation.") ENDIF() diff --git a/fbgemm_gpu/codegen/__init__.template b/fbgemm_gpu/codegen/__init__.template index 6f361ae93..de8bf21dd 100644 --- a/fbgemm_gpu/codegen/__init__.template +++ b/fbgemm_gpu/codegen/__init__.template @@ -13,9 +13,7 @@ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_lars_sgd as loo import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_adam as lookup_partial_rowwise_adam # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_lamb as lookup_partial_rowwise_lamb # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad as lookup_rowwise_adagrad # noqa: F401 -import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad_with_weight_decay as lookup_rowwise_adagrad_with_weight_decay # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_sgd as lookup_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_sgd as lookup_approx_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad as lookup_approx_rowwise_adagrad # noqa: F401 -import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad_with_weight_decay as 
lookup_approx_rowwise_adagrad_with_weight_decay # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_weighted_adagrad as lookup_rowwise_weighted_adagrad # noqa: F401 diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 7b7b015d2..d06a0f0ca 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -434,6 +434,9 @@ def rowwise_adagrad() -> None: } else if (weight_decay_mode == 2) { // Decoupled weight decay correction = 1.0 - learning_rate * weight_decay; + } else { + // default value + correction = 1.0; } } multiplier = shfl_sync(multiplier, 0); @@ -461,6 +464,9 @@ def rowwise_adagrad() -> None: } else if (weight_decay_mode == 2) { // Decoupled weight decay correction = 1.0 - learning_rate * weight_decay; + } else { + // default value + correction = 1.0; } for (int64_t d = 0; d < D; ++d) { host_weights_data[embedding_begin + d] = correction * host_weights_data[embedding_begin + d] - grad_buffer[d] * multiplier; @@ -549,6 +555,9 @@ def rowwise_adagrad_with_weight_decay() -> None: } else if (weight_decay_mode == 2) { // Decoupled weight decay correction = 1.0 - learning_rate * weight_decay; + } else { + // default value + correction = 1.0; } } multiplier = shfl_sync(multiplier, 0); @@ -576,6 +585,9 @@ def rowwise_adagrad_with_weight_decay() -> None: } else if (weight_decay_mode == 2) { // Decoupled weight decay correction = 1.0 - learning_rate * weight_decay; + } else { + // default value + correction = 1.0; } for (int64_t d = 0; d < D; ++d) { host_weights_data[embedding_begin + d] = correction * host_weights_data[embedding_begin + d] - grad_buffer[d] * multiplier; @@ -1236,13 +1248,16 @@ def forward_quantized() -> None: class elem_type: enum_name: str cpp_type_name: str + primitive_type: str + bit_width: int type_map = { - 32: elem_type("FP32", "float"), - 16: elem_type("FP16", "__half2"), - 8: elem_type("INT8", "uint32_t"), - 4: elem_type("INT4", "uint32_t"), - 2: elem_type("INT2", "uint32_t"), + "FP32": elem_type("FP32", "float", "FP", 32), + "FP16": elem_type("FP16", "__half2", "FP", 16), + "FP8": elem_type("FP8", "uint32_t", "FP", 8), + "INT8": elem_type("INT8", "uint32_t", "INT", 8), + "INT4": elem_type("INT4", "uint32_t", "INT", 4), + "INT2": elem_type("INT2", "uint32_t", "INT", 2), } template = env.get_template("embedding_forward_quantized_split_template.cu") @@ -1316,4 +1331,3 @@ def main() -> None: if __name__ == "__main__": main() - # hipify_gen() diff --git a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp index 0fba82286..7117db750 100644 --- a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp @@ -13,6 +13,7 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; Tensor dense_embedding_codegen_forward_unweighted_cuda( Tensor dev_weights, @@ -173,8 +174,12 @@ class SplitLookupFunction_Dense_Op using torch::autograd::Variable; auto grad_output = grad_outputs[0]; - if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 || - grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) { + + // FIXME: to support aligned memory access in Vec4T load/store function + // 16 for FP32 and 8 for FP16 + if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0) { + grad_output = at::empty_like(grad_output).copy_(grad_output); + } else if 
(!grad_output.is_contiguous()) { grad_output = grad_output.contiguous(); } @@ -323,8 +328,11 @@ class SplitNoBagLookupFunction_Dense_Op using torch::autograd::Variable; auto grad_output = grad_outputs[0]; - if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 || - grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) { + // FIXME: to support aligned memory access in Vec4T load/store function + // 16 for FP32 and 8 for FP16 + if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0) { + grad_output = at::empty_like(grad_output).copy_(grad_output); + } else if (!grad_output.is_contiguous()) { grad_output = grad_output.contiguous(); } diff --git a/fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp index ae01d965b..053c123e9 100644 --- a/fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp @@ -13,6 +13,7 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; Tensor split_embedding_backward_codegen_dense_cpu( Tensor grad_output, diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp index 28e50399b..024765225 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp @@ -17,6 +17,7 @@ #include "fbgemm_gpu/embedding_common.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; namespace { template diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp index bc994ccb9..2fedcc39f 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp @@ -19,6 +19,7 @@ #include "fbgemm_gpu/cpu_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; namespace internal { template @@ -61,8 +62,7 @@ void split_embedding_backward_exact_cpu_kernel( const bool has_weights = indice_weights.defined(); auto grad_stride = grad_output.size(1); - std::vector<::internal::BatchedHyperCompressedSparseColumn> batched_cscs( - num_tables); + std::vector<::internal::HyperCompressedSparseColumn> cscs(num_tables); auto get_hash_size = [&hash_size_cumsum_data](int feature_begin) { int64_t hash_size; @@ -83,8 +83,8 @@ void split_embedding_backward_exact_cpu_kernel( int feature_begin = table_to_feature_offset[t]; int64_t hash_size = get_hash_size(feature_begin); - ::internal::batched_csr2csc( - batched_cscs[t], + ::internal::csr2csc( + cscs[t], B, offsets.accessor(), indices.accessor(), @@ -95,16 +95,13 @@ void split_embedding_backward_exact_cpu_kernel( table_to_feature_offset + t, hash_size); } - // sort based csr2csc handles segment_ids differently - bool is_csr2csc_sort = batched_cscs[0].weights == nullptr; for (int t = 0; t < num_tables; ++t) { int feature_begin = table_to_feature_offset[t]; - int c_begin = batched_cscs[t].table_ptr[0]; - int c_end = batched_cscs[t].table_ptr[1]; - int* col_segment_ptr = batched_cscs[t].column_segment_ptr; - int* col_segment_indices = batched_cscs[t].column_segment_indices; + int num_non_zero_columns = cscs[t].num_non_zero_columns; + int* col_segment_ptr = cscs[t].column_segment_ptr; + int* col_segment_indices = cscs[t].column_segment_indices; auto hash_size = get_hash_size(feature_begin); @@ -127,7 +124,7 @@ void 
split_embedding_backward_exact_cpu_kernel( /*IndexType=*/int32_t, /*OffsetType=*/int32_t>( D, - batched_cscs[t].weights != nullptr, + cscs[t].weights != nullptr, /*normalize_by_lengths=*/false, /*prefetch=*/16, /*is_weight_positional=*/false, @@ -138,7 +135,7 @@ void split_embedding_backward_exact_cpu_kernel( fbgemm::GenerateSparseAdaGrad(D, /*rowwise=*/true); constexpr int C_BLOCK = 64; - at::parallel_for(c_begin, c_end, C_BLOCK, [&](int64_t c0, int64_t c1) { + at::parallel_for(0, num_non_zero_columns, C_BLOCK, [&](int64_t c0, int64_t c1) { grad_t grad_blocked_buffer[C_BLOCK * D]; for (int64_t c = c0; c < c1; c += C_BLOCK) { const int* offsets_begin_ptr = col_segment_ptr + c; @@ -149,11 +146,11 @@ void split_embedding_backward_exact_cpu_kernel( B, reinterpret_cast( grad_output_data + D_begin), - batched_cscs[t].row_indices + *offsets_begin_ptr, + cscs[t].row_indices + *offsets_begin_ptr, offsets_begin_ptr, - batched_cscs[t].weights == nullptr + cscs[t].weights == nullptr ? nullptr - : batched_cscs[t].weights + *offsets_begin_ptr, + : cscs[t].weights + *offsets_begin_ptr, reinterpret_cast(grad_blocked_buffer)); if (!success) { @@ -163,7 +160,7 @@ void split_embedding_backward_exact_cpu_kernel( c, c_block_end, col_segment_ptr, - batched_cscs[t].row_indices, + cscs[t].row_indices, hash_size, /*allow_minus_one=*/false); } @@ -195,29 +192,28 @@ void split_embedding_backward_exact_cpu_kernel( // TODO: to parallelize, we should easily identify segments belong to // the same column. at::acc_type grad_buffer[D]; - for (int c = c_begin; c < c_end; ++c) { + for (int c = 0; c < num_non_zero_columns; ++c) { int64_t idx = col_segment_indices[c]; - if (c == c_begin || col_segment_indices[c - 1] != idx) { + if (c == 0 || col_segment_indices[c - 1] != idx) { memset(grad_buffer, 0, D * sizeof(at::acc_type)); } const int64_t embedding_begin = table_begin + idx * D; for (int r = col_segment_ptr[c]; r < col_segment_ptr[c + 1]; ++r) { int D_offset = D_begin; if (is_shared_table) { - D_offset += - batched_cscs[t].column_segment_ids[is_csr2csc_sort ? 
r : c] * D; + D_offset += cscs[t].column_segment_ids[r] * D; } - int b = batched_cscs[t].row_indices[r]; + int b = cscs[t].row_indices[r]; for (int64_t d = 0; d < D; ++d) { - if (batched_cscs[t].weights != nullptr) { + if (cscs[t].weights != nullptr) { grad_buffer[d] += grad_output_data[b * grad_stride + D_offset + d] * - batched_cscs[t].weights[r]; + cscs[t].weights[r]; } else { grad_buffer[d] += grad_output_data[b * grad_stride + D_offset + d]; } } } - if (c == c_end - 1 || col_segment_indices[c + 1] != idx) { + if (c == num_non_zero_columns - 1 || col_segment_indices[c + 1] != idx) { {{ split_weight_update_cpu }} } } // for each c diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp index 7a6c4698e..e16d103d7 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp @@ -14,6 +14,9 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +/// @defgroup embedding-cpu Embedding CPU Operators void split_embedding_backward_codegen_{{ optimizer }}_cpu( Tensor grad_output, @@ -176,6 +179,7 @@ class SplitLookupFunction_{{ optimizer }}_Op : public torch::autograd::Function< } }; +///@ingroup embedding-cpu Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( Tensor host_weights, Tensor weights_placements, diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp index bf2c3dee9..70fcd1f54 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp @@ -14,6 +14,9 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +/// @defgroup embedding-cuda Embedding CUDA Operators Tensor split_embedding_codegen_forward_unweighted_cuda( Tensor dev_weights, @@ -186,6 +189,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op : {% for (var, _) in args.saved_data %} ctx->saved_data["{{ var }}"] = {{ var }}; {% endfor %} + {% if not nobag %} #ifdef __HIP_PLATFORM_HCC__ constexpr int32_t BT_block_size = 64; @@ -270,9 +274,11 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op : using torch::autograd::Variable; auto grad_output = gradient_clipping ? 
clamp(grad_outputs[0], -max_gradient, max_gradient) : grad_outputs[0]; - if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 || - grad_output.stride(1) != 1 || - grad_output.stride(0) % 4 != 0) { + // FIXME: to support aligned memory access in Vec4T load/store function + // 16 for FP32 and 8 for FP16 + if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0) { + grad_output = at::empty_like(grad_output).copy_(grad_output); + } else if (!grad_output.is_contiguous()) { grad_output = grad_output.contiguous(); } @@ -423,6 +429,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op : }; {% endfor %} +///@ingroup embedding-cuda Tensor split_embedding_codegen_lookup_{{ optimizer }}_function( Tensor placeholder_autograd_tensor, Tensor dev_weights, diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index a18ae700e..a8d9c4b70 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -968,6 +968,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ // over 48 KB per block are architecture-specific, as such they // must use dynamic shared memory (rather than statically sized // arrays) and require an explicit opt-in using cudaFuncSetAttribute()". + #ifndef __HIP_PLATFORM_HCC__ cudaFuncSetAttribute( split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_cta_per_row_1< diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu index 8d7c5d196..4d77d2b50 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -13,9 +13,9 @@ template __device__ void adjust_offset_kernel( index_t& indices_start, index_t& indices_end, - index_t num_indices, - index_t* offset_acc_start, - index_t* offset_acc_end) { + const index_t num_indices, + index_t* const offset_acc_start, + index_t* const offset_acc_end) { indices_start = std::max(static_cast(0), std::min(indices_start, num_indices)); indices_end = std::max(indices_start, std::min(indices_end, num_indices)); @@ -29,7 +29,7 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( rows_per_table, at::PackedTensorAccessor32 indices, at::PackedTensorAccessor32 offsets, - int64_t bounds_check_mode_, + const int64_t bounds_check_mode_, at::PackedTensorAccessor32 warning, FixedDivisor fd) { int32_t T = rows_per_table.size(0); @@ -84,10 +84,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( &offsets[t * B + b + 1]); } - auto L = indices_end - indices_start; + const auto L = indices_end - indices_start; for (index_t i = static_cast(threadIdx.x); i < L; i += static_cast(fbgemm_gpu::kWarpSize)) { - auto idx = indices[indices_start + i]; + const auto idx = indices[indices_start + i]; if (idx == -1) { // -1 indicates pruned rows. 
continue; @@ -161,16 +161,17 @@ void bounds_check_indices_cuda( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(rows_per_table.get_device()); - int32_t T = rows_per_table.size(0); - int32_t B = (offsets.size(0) - 1) / T; + const int32_t T = rows_per_table.size(0); + const int32_t B = (offsets.size(0) - 1) / T; if (B == 0 || T == 0) { return; } - auto bounds_check_mode = static_cast(bounds_check_mode_); + const auto bounds_check_mode = + static_cast(bounds_check_mode_); if (bounds_check_mode == BoundsCheckMode::WARNING) { warning.zero_(); } - int64_t num_indices = indices.size(0); + const int64_t num_indices = indices.size(0); TORCH_CHECK( offsets.size(0) == B * T + 1, diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 8d2ead3e5..84575a336 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -14,6 +14,9 @@ using Tensor = at::Tensor; +///@defgroup embedding-cuda Embedding CUDA Operators + +///@ingroup embedding-cuda void bounds_check_indices_cuda( Tensor& rows_per_table, Tensor& indices, diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index 88d5893d9..a2dd19a75 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -12,6 +12,10 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +///@defgroup embedding-cpu Embedding CPU Operators +/// namespace { @@ -31,6 +35,7 @@ void adjust_offset_cpu( *offsets_acc_end = indices_end; } +///@addtogroup embedding-cpu void bounds_check_indices_cpu( Tensor& rows_per_table, Tensor& indices, diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 03db60ac1..5d2f10c51 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -19,6 +19,8 @@ #include #include +using namespace fbgemm_gpu; + namespace { using Tensor = at::Tensor; @@ -116,7 +118,8 @@ Tensor int_nbit_split_embedding_codegen_forward_{{ wdesc }}_cpu( Tensor indice_weights, {% endif %} int64_t output_dtype, - int64_t unused + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias ) { TENSOR_ON_CPU(dev_weights); TENSOR_ON_CPU(uvm_weights); @@ -160,7 +163,7 @@ Tensor int_nbit_split_embedding_codegen_forward_{{ wdesc }}_cpu( const auto* weights_tys_acc = weights_tys.data_ptr(); - DISPATCH_OUTPUT_TYPES(output.type(), "intn_split_embedding_codegen_forward_kernel", [&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "intn_split_embedding_codegen_forward_kernel", [&] { auto* output_acc = output.data_ptr(); {% if weighted %} const float* indice_weights_acc = indice_weights.data_ptr(); @@ -246,6 +249,26 @@ Tensor int_nbit_split_embedding_codegen_forward_{{ wdesc }}_cpu( offsets_begin_ptr, indice_weights_ptr, reinterpret_cast(output_acc + D_start)); + } else if (weight_ty == SparseType::FP8) { + assert(fp8_exponent_bits > 0 && fp8_exponent_bias > 0); + auto kernel = fbgemm::GenerateEmbeddingSpMDMFP8WithStrides( + D, + normalize_by_lengths, + /*is_weight_positional=*/false, + /*use_offsets=*/true, + /*output_stride=*/total_D, + /*input_stride=*/D_bytes / sizeof(uint8_t), + /*exponent_bits=*/fp8_exponent_bits, + /*exponent_bias=*/fp8_exponent_bias); + success = kernel( + B, + 
index_size, + num_rows, + weights, + indices_acc + *offsets_begin_ptr, + offsets_begin_ptr, + indice_weights_ptr, + reinterpret_cast(output_acc + D_start)); } else if (weight_ty == SparseType::INT8) { auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides( D, diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 13cacd48a..be9d1b476 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -14,6 +14,10 @@ #include "fbgemm_gpu/sparse_ops_utils.h" using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +///@defgroup embedding-cuda Embedding CUDA Operators +/// Tensor int_nbit_split_embedding_codegen_forward_unweighted_cuda( Tensor dev_weights, @@ -35,7 +39,9 @@ Tensor int_nbit_split_embedding_codegen_forward_unweighted_cuda( int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, - int64_t unused); + int64_t max_float8_D, + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias); Tensor int_nbit_split_embedding_codegen_forward_weighted_cuda( Tensor dev_weights, @@ -58,7 +64,9 @@ Tensor int_nbit_split_embedding_codegen_forward_weighted_cuda( int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, - int64_t unused); + int64_t max_float8_D, + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias); Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda( Tensor dev_weights, @@ -78,8 +86,11 @@ Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda( int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, - int64_t unused); + int64_t max_float8_D, + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias); +///@ingroup embedding-cuda Tensor int_nbit_split_embedding_codegen_lookup_function( Tensor dev_weights, Tensor uvm_weights, @@ -100,10 +111,18 @@ Tensor int_nbit_split_embedding_codegen_lookup_function( int64_t output_dtype, c10::optional lxu_cache_weights, c10::optional lxu_cache_locations, - c10::optional row_alignment) { + c10::optional row_alignment, + c10::optional max_float8_D, + c10::optional fp8_exponent_bits, + c10::optional fp8_exponent_bias) { if (static_cast(pooling_mode) == PoolingMode::NONE) { std::vector max_D_list{ - max_int2_D, max_int4_D, max_int8_D, max_float16_D, max_float32_D}; + max_int2_D, + max_int4_D, + max_int8_D, + max_float8_D ? *max_float8_D : 0, + max_float16_D, + max_float32_D}; int64_t max_D = *std::max_element(max_D_list.begin(), max_D_list.end()); return int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda( dev_weights, @@ -123,7 +142,9 @@ Tensor int_nbit_split_embedding_codegen_lookup_function( output_dtype, lxu_cache_weights.value_or(at::empty({0, 0}, at::kByte)), lxu_cache_locations.value_or(at::empty({0}, at::kInt)), - 0); + max_float8_D ? *max_float8_D : 0, + fp8_exponent_bits ? *fp8_exponent_bits : -1, + fp8_exponent_bias ? *fp8_exponent_bias : -1); } if (!indice_weights) { return int_nbit_split_embedding_codegen_forward_unweighted_cuda( @@ -146,7 +167,9 @@ Tensor int_nbit_split_embedding_codegen_lookup_function( output_dtype, lxu_cache_weights.value_or(at::empty({0, 0}, at::kByte)), lxu_cache_locations.value_or(at::empty({0}, at::kInt)), - 0); + max_float8_D ? *max_float8_D : 0, + fp8_exponent_bits ? *fp8_exponent_bits : -1, + fp8_exponent_bias ? 
*fp8_exponent_bias : -1); } return int_nbit_split_embedding_codegen_forward_weighted_cuda( dev_weights, @@ -169,15 +192,19 @@ Tensor int_nbit_split_embedding_codegen_lookup_function( output_dtype, lxu_cache_weights.value_or(at::empty({0, 0}, at::kByte)), lxu_cache_locations.value_or(at::empty({0}, at::kInt)), - 0); + max_float8_D ? *max_float8_D : 0, + fp8_exponent_bits ? *fp8_exponent_bits : -1, + fp8_exponent_bias ? *fp8_exponent_bias : -1); } +///@ingroup embedding-cuda Tensor pruned_hashmap_lookup_unweighted_cuda( Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets); +///@ingroup embedding-cuda Tensor pruned_array_lookup_cuda( Tensor indices, Tensor offsets, diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 3100db743..82876fa70 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -20,6 +20,9 @@ using Tensor = at::Tensor; +///@defgroup embedding-cpu Embedding CPU Operators +/// + Tensor int_nbit_split_embedding_codegen_forward_unweighted_cpu( Tensor dev_weights, Tensor uvm_weights, @@ -33,7 +36,8 @@ Tensor int_nbit_split_embedding_codegen_forward_unweighted_cpu( int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, - int64_t unused); + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias); Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu( Tensor dev_weights, @@ -49,8 +53,10 @@ Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu( int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, - int64_t unused); + int64_t fp8_exponent_bits, + int64_t fp8_exponent_bias); +///@ingroup embedding-cpu Tensor int_nbit_split_embedding_codegen_lookup_function_cpu( Tensor dev_weights, Tensor uvm_weights, // to match the interface of CUDA op using UVM @@ -73,7 +79,10 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu( lxu_cache_weights, // Not used, to match cache interface for CUDA op c10::optional lxu_cache_locations, // Not used, to match cache interface for CUDA op - c10::optional row_alignment) { + c10::optional row_alignment, + c10::optional max_float8_D, + c10::optional fp8_exponent_bits, + c10::optional fp8_exponent_bias) { if (!indice_weights) { return int_nbit_split_embedding_codegen_forward_unweighted_cpu( dev_weights, @@ -88,7 +97,8 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu( pooling_mode, row_alignment ? *row_alignment : 1, output_dtype, - 0); + fp8_exponent_bits ? *fp8_exponent_bits : -1, + fp8_exponent_bias ? *fp8_exponent_bias : -1); } return int_nbit_split_embedding_codegen_forward_weighted_cpu( dev_weights, @@ -104,9 +114,11 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu( row_alignment ? *row_alignment : 1, *indice_weights, output_dtype, - 0); + fp8_exponent_bits ? *fp8_exponent_bits : -1, + fp8_exponent_bias ? 
*fp8_exponent_bias : -1); } +///@ingroup embedding-cpu void pruned_hashmap_insert_unweighted_cpu( Tensor indices, Tensor dense_indices, @@ -114,12 +126,14 @@ void pruned_hashmap_insert_unweighted_cpu( Tensor hash_table, Tensor hash_table_offsets); +///@ingroup embedding-cpu Tensor pruned_hashmap_lookup_unweighted_cpu( Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets); +///@ingroup embedding-cpu Tensor pruned_array_lookup_cpu( Tensor indices, Tensor offsets, @@ -128,7 +142,7 @@ Tensor pruned_array_lookup_cpu( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( - "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None) -> Tensor"); + "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); DISPATCH_TO_CPU( "int_nbit_split_embedding_codegen_lookup_function", int_nbit_split_embedding_codegen_lookup_function_cpu); diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index eea68c958..ab56c2028 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -20,6 +20,7 @@ constexpr int32_t kCacheLocationMissing = -1; __device__ inline int32_t padded_D(int32_t dim, SparseType weight_ty) { if (weight_ty == SparseType::FP32) { return dim; } if (weight_ty == SparseType::FP16) { return dim; } + if (weight_ty == SparseType::FP8) { return dim; } if (weight_ty == SparseType::INT8) { return dim + 4; } if (weight_ty == SparseType::INT4) { return dim + 8; } if (weight_ty == SparseType::INT2) { return dim + 16; } @@ -126,7 +127,7 @@ void cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { "Size is not supported"); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); - int src_in_bytes = (pred_guard ? SizeInBytes : 0); + const int src_in_bytes = pred_guard ? SizeInBytes : 0; asm volatile( "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr), @@ -150,10 +151,10 @@ void cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { {% for nobag in [True, False] %} {% if not nobag or not weighted %} // TODO: increase code sharing (templates for accumulator_ty, accumulation, outputs per thread, etc?) 
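Editor's note: the updated operator schema and host wrappers above thread three new optional FP8 arguments (max_float8_D, fp8_exponent_bits, fp8_exponent_bias) through to the kernels, unwrapping each to its schema default (0, -1, -1) when the caller omits it. A minimal standalone sketch of that defaulting pattern follows; it is illustrative only and not part of the patch, and it uses std::optional in place of c10::optional so it compiles on its own.

#include <cstdint>
#include <iostream>
#include <optional>

// Mirrors the `arg ? *arg : fallback` expressions used when forwarding the
// optional FP8 schema arguments to the CUDA/CPU forward functions.
static int64_t value_or_default(std::optional<int64_t> arg, int64_t fallback) {
  return arg ? *arg : fallback;
}

int main() {
  std::optional<int64_t> max_float8_D;           // caller configured no FP8 tables
  std::optional<int64_t> fp8_exponent_bits = 4;  // e.g. an e4m3-style layout
  std::cout << value_or_default(max_float8_D, 0) << '\n';        // prints 0
  std::cout << value_or_default(fp8_exponent_bits, -1) << '\n';  // prints 4
  return 0;
}

A caller that never configures FP8 tables therefore takes the same path as before this change, since max_float8_D defaults to 0 and the FP8 kernel launch is guarded by max_float8_D > 0.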
-{% for bit_width in [32, 16, 8, 4, 2] %} +{% for emb_weight_type in ["FP32", "FP16", "FP8", "INT8", "INT4", "INT2"] %} template __launch_bounds__(WarpsPerBlock * kWarpSize) -__global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L( +__global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L( const at::PackedTensorAccessor64 dev_weights, const at::PackedTensorAccessor64 uvm_weights, const at::PackedTensorAccessor32 weights_placements, @@ -162,30 +163,34 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i {% if not nobag %} const at::PackedTensorAccessor32 D_offsets, {% else %} - int64_t D, + const int64_t D, {% endif %} const at::PackedTensorAccessor32 indices, const at::PackedTensorAccessor32 offsets, {% if not nobag %} - int64_t pooling_mode, + const int64_t pooling_mode, {% endif %} - int64_t row_alignment, + const int64_t row_alignment, {% if weighted %} at::PackedTensorAccessor32 indice_weights, {% endif %} + {% if type_map[emb_weight_type].enum_name == "FP8" %} + const int exponent_bits, + const int exponent_bias, + {% endif %} at::PackedTensorAccessor32 output, // [B][total_D], const at::PackedTensorAccessor64 lxu_cache_weights, const at::PackedTensorAccessor32 lxu_cache_locations ) { - int32_t T = weights_offsets.size(0); + const int32_t T = weights_offsets.size(0); {% if not nobag %} - int32_t B = output.size(0); + const int32_t B = output.size(0); {% else %} - int32_t B = (offsets.size(0) - 1) / T; + const int32_t B = (offsets.size(0) - 1) / T; {% endif %} - int32_t bb_t = blockIdx.x * blockDim.y + threadIdx.y; + const int32_t bb_t = blockIdx.x * blockDim.y + threadIdx.y; if (bb_t >= div_round_up(B, OutputRowsPerThread) * T) { return; } @@ -202,7 +207,7 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i const int32_t D = D_end - D_start; {% endif %} SparseType weight_ty = static_cast(weights_tys[t]); - if (weight_ty != SparseType::{{ type_map[bit_width].enum_name }}) { + if (weight_ty != SparseType::{{ type_map[emb_weight_type].enum_name }}) { return; } @@ -213,9 +218,9 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i return; } - uint32_t bb = bb_t % div_round_up(B, OutputRowsPerThread); + const uint32_t bb = bb_t % div_round_up(B, OutputRowsPerThread); - int64_t weights_offset = weights_offsets[t]; + const int64_t weights_offset = weights_offsets[t]; const int32_t D_total = padded_D(D, weight_ty); const int32_t D_padding = D_total - D; @@ -240,13 +245,13 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i } else { weights = &uvm_weights[weights_offset]; } - constexpr size_t kOutputsPerThread = {{ (32 // bit_width) }}; + constexpr size_t kOutputsPerThread = {{ (32 // type_map[emb_weight_type].bit_width) }}; constexpr uint32_t NumUint4PerRow = MaxNum128BRows * 128 / sizeof(uint4); const uint32_t uint4_loads_per_row = div_round_up(D_bytes, sizeof(uint4)); {% if not nobag %} - VecNT<{{ (32 // bit_width) }}> accumulators[OutputRowsPerThread][MaxNum128BRows]; + VecNT<{{ (32 // type_map[emb_weight_type].bit_width) }}, PrimitiveType::{{ type_map[emb_weight_type].primitive_type }}> accumulators[OutputRowsPerThread][MaxNum128BRows]; {% endif %} for (uint32_t L_start = 0; L_start < max_Ls; L_start += InputRowsInFlight) { @@ -304,7 +309,7 @@ __global__ void {{ type_map[bit_width].enum_name 
}}_split_embedding{{ "_nobag" i // scale and bias are at the beginning of each row. // rationale: have scale/shift at start since these get loaded first // and then broadcasted around so it might speed up the first cache miss. - {% if bit_width in [8, 4, 2] %} + {% if type_map[emb_weight_type].primitive_type == "INT" %} half2 shift_scale = reinterpret_cast(row)[0]; {% endif %} @@ -312,16 +317,16 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i float row_weight = buffers_indice_weights[warp_idx][i][input_row_idx]; {% endif %} - using scalar_t = {{ type_map[bit_width].cpp_type_name }}; + using scalar_t = {{ type_map[emb_weight_type].cpp_type_name }}; {% if not nobag %} #pragma unroll MaxNum128BRows for (uint32_t j = 0; j < MaxNum128BRows; ++j) { scalar_t v = reinterpret_cast(row)[kWarpSize * j + threadIdx.x]; {% if weighted %} - accumulators[i][j].fma(v, {% if bit_width in [8, 4, 2] %} shift_scale, {% endif %} row_weight); + accumulators[i][j].fma(v, {% if type_map[emb_weight_type].primitive_type == "INT" %} shift_scale, {% elif type_map[emb_weight_type].enum_name == "FP8" %} exponent_bits, exponent_bias, {% endif %} row_weight); {% else %} - accumulators[i][j].add(v{% if bit_width in [8, 4, 2] %}, shift_scale {% endif %}); + accumulators[i][j].add(v{% if type_map[emb_weight_type].primitive_type == "INT" %}, shift_scale {% elif type_map[emb_weight_type].enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %}); {% endif %} } {% else %} @@ -336,8 +341,8 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding; scalar_t v = reinterpret_cast(row)[kWarpSize * j + threadIdx.x]; if (output_d >= 0 && output_d < D) { - const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // bit_width) }})); - VecNT<{{ (32 // bit_width) }}> acc(v{% if bit_width in [8, 4, 2] %}, shift_scale {% endif %}); + const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // type_map[emb_weight_type].bit_width) }})); + VecNT<{{ (32 // type_map[emb_weight_type].bit_width) }}, PrimitiveType::{{ type_map[emb_weight_type].primitive_type }}> acc(v{% if type_map[emb_weight_type].primitive_type == "INT" %}, shift_scale {% elif type_map[emb_weight_type].enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %}); acc.store(&output[output_j][output_d], num_valid_outputs); } } @@ -351,10 +356,10 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i for (uint32_t j = 0; j < MaxNum128BRows; ++j) { int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding; scalar_t v = reinterpret_cast(row)[kWarpSize * j + threadIdx.x]; - VecNT<{{ (32 // bit_width) }}> acc(v{% if bit_width in [8, 4, 2] %}, shift_scale {% endif %}); + VecNT<{{ (32 // type_map[emb_weight_type].bit_width) }}, PrimitiveType::{{ type_map[emb_weight_type].primitive_type }}> acc(v{% if type_map[emb_weight_type].primitive_type == "INT" %}, shift_scale {% elif type_map[emb_weight_type].enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %}); if (output_d >= 0 && output_d < D) { - thread_local_max = max(thread_local_max, float{{ (32 // bit_width) }}_max(acc.acc)); - thread_local_min = min(thread_local_min, float{{ (32 // bit_width) }}_min(acc.acc)); + thread_local_max = max(thread_local_max, float{{ (32 // type_map[emb_weight_type].bit_width) }}_max(acc.acc)); + thread_local_min = 
min(thread_local_min, float{{ (32 // type_map[emb_weight_type].bit_width) }}_min(acc.acc)); } } qparams = warp_find_qparams(thread_local_min, thread_local_max); @@ -363,8 +368,8 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding; scalar_t v = reinterpret_cast(row)[kWarpSize * j + threadIdx.x]; if (output_d >= 0 && output_d < D) { - const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // bit_width) }})); - VecNT<{{ (32 // bit_width) }}> acc(v{% if bit_width in [8, 4, 2] %}, shift_scale {% endif %}); + const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // type_map[emb_weight_type].bit_width) }})); + VecNT<{{ (32 // type_map[emb_weight_type].bit_width) }}, PrimitiveType::{{ type_map[emb_weight_type].primitive_type }}> acc(v{% if type_map[emb_weight_type].primitive_type == "INT" %}, shift_scale {% elif type_map[emb_weight_type].enum_name == "FP8" %}, exponent_bits, exponent_bias {% endif %}); acc.store(&output[output_j][output_d], qparams, num_valid_outputs); } } @@ -393,7 +398,7 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i } if (output_d >= 0 && output_d < D) { - const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // bit_width) }})); + const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // type_map[emb_weight_type].bit_width) }})); accumulators[i][j].store(&output[b][D_start + output_d], num_valid_outputs); } @@ -411,19 +416,19 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i accumulators[i][j].mul(inv_L); } if (output_d >= 0 && output_d < D) { - thread_local_max = max(thread_local_max, float{{ (32 // bit_width) }}_max(accumulators[i][j].acc)); - thread_local_min = min(thread_local_min, float{{ (32 // bit_width) }}_min(accumulators[i][j].acc)); + thread_local_max = max(thread_local_max, float{{ (32 // type_map[emb_weight_type].bit_width) }}_max(accumulators[i][j].acc)); + thread_local_min = min(thread_local_min, float{{ (32 // type_map[emb_weight_type].bit_width) }}_min(accumulators[i][j].acc)); } } qparams = warp_find_qparams(thread_local_min, thread_local_max); - int output_D_start = D_start + t * 8; - int output_D_end = output_D_start + D; + const int output_D_start = D_start + t * 8; + const int output_D_end = output_D_start + D; #pragma unroll MaxNum128BRows for (uint32_t j = 0; j < MaxNum128BRows; ++j) { const int32_t output_d = kWarpSize * j * kOutputsPerThread + threadIdx.x * kOutputsPerThread - D_padding; if (output_d >= 0 && output_d < D) { - const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // bit_width) }})); + const int num_valid_outputs = min(static_cast(D - output_d), static_cast({{ (32 // type_map[emb_weight_type].bit_width) }})); accumulators[i][j].store(&output[b][output_D_start + output_d], qparams, num_valid_outputs); } } @@ -436,7 +441,7 @@ __global__ void {{ type_map[bit_width].enum_name }}_split_embedding{{ "_nobag" i } {% endif %} } -{% endfor %} // for bit_width in [32, 16, 8, 4, 2] +{% endfor %} // for emb_weight_type in ["FP32", "FP16", "FP8", "INT8", "INT4", "INT2"] {% endif %} // if not nobag or not weighted {% endfor %} // for nobag in [True, False] @@ -455,23 +460,23 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ const at::PackedTensorAccessor32 offsets, const 
at::PackedTensorAccessor64 hash_table, const at::PackedTensorAccessor32 hash_table_offsets, - int32_t B, - int32_t T, + const int32_t B, + const int32_t T, at::PackedTensorAccessor32 dense_indices) { // uint32_t capacity = hash_table.size(0); - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t t = b_t / B; - int32_t b = b_t % B; + const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; + const int32_t t = b_t / B; + const int32_t b = b_t % B; if (b_t >= B * T) { return; } - int32_t indices_start = offsets[t * B + b]; - int32_t indices_end = offsets[t * B + b + 1]; - int32_t L = indices_end - indices_start; + const int32_t indices_start = offsets[t * B + b]; + const int32_t indices_end = offsets[t * B + b + 1]; + const int32_t L = indices_end - indices_start; - int64_t table_start = hash_table_offsets[t]; - int64_t table_end = hash_table_offsets[t + 1]; - int64_t capacity = table_end - table_start; + const int64_t table_start = hash_table_offsets[t]; + const int64_t table_end = hash_table_offsets[t + 1]; + const int64_t capacity = table_end - table_start; if (capacity == 0) { // No pruning applied on the indices associated with this table. @@ -481,21 +486,21 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ return; } - uint32_t subwarp_id = threadIdx.x / 4; - uint32_t subwarp_tid = threadIdx.x % 4; + const uint32_t subwarp_id = threadIdx.x / 4; + const uint32_t subwarp_tid = threadIdx.x % 4; #ifdef __HIP_PLATFORM_HCC__ - uint64_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); + const uint64_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); #else - uint32_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); + const uint32_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); #endif for (int32_t l_start = 0; l_start + subwarp_id < L; l_start += kWarpSize / 4) { - int32_t idx = indices[indices_start + l_start + subwarp_id]; + const int32_t idx = indices[indices_start + l_start + subwarp_id]; uint32_t slot_start = pruned_hash_function(static_cast(idx)) % capacity; while (true) { - uint32_t slot = (slot_start + subwarp_tid) % capacity; - int2 val = *reinterpret_cast(&hash_table[table_start + static_cast(slot)][0]); - int32_t slot_sparse_idx = val.x; - int32_t slot_dense_idx = val.y; + const uint32_t slot = (slot_start + subwarp_tid) % capacity; + const int2 val = *reinterpret_cast(&hash_table[table_start + static_cast(slot)][0]); + const int32_t slot_sparse_idx = val.x; + const int32_t slot_dense_idx = val.y; bool found = false; bool empty = false; @@ -505,20 +510,9 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ found = true; dense_indices[indices_start + l_start + subwarp_id] = slot_dense_idx; } -#ifdef __HIP_PLATFORM_HCC__ - // FIXME: __any_sync with mask isn't supported by HIP yet. - // See https://fburl.com/fvy7j0lq for the similar context. 
- // assert false here with https://fburl.com/pfm7enw2 - if (__any_sync(subwarp_mask, found)) { -#else if (__any_sync(subwarp_mask, found)) { -#endif break; -#ifdef __HIP_PLATFORM_HCC__ - } else if (__any_sync(subwarp_mask, empty)) { -#else } else if (__any_sync(subwarp_mask, empty)) { -#endif dense_indices[indices_start + l_start + subwarp_id] = -1; break; } @@ -533,22 +527,22 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ const at::PackedTensorAccessor32 offsets, const at::PackedTensorAccessor32 index_remappings, const at::PackedTensorAccessor32 index_remappings_offsets, - int32_t B, - int32_t T, + const int32_t B, + const int32_t T, at::PackedTensorAccessor32 dense_indices) { - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t t = b_t / B; - int32_t b = b_t % B; + const int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; + const int32_t t = b_t / B; + const int32_t b = b_t % B; if (b_t >= B * T) { return; } - int32_t indices_start = offsets[t * B + b]; - int32_t indices_end = offsets[t * B + b + 1]; - int32_t L = indices_end - indices_start; + const int32_t indices_start = offsets[t * B + b]; + const int32_t indices_end = offsets[t * B + b + 1]; + const int32_t L = indices_end - indices_start; - int64_t index_remappings_start = index_remappings_offsets[t]; - int64_t index_remappings_end = index_remappings_offsets[t + 1]; - int64_t capacity = index_remappings_end - index_remappings_start; + const int64_t index_remappings_start = index_remappings_offsets[t]; + const int64_t index_remappings_end = index_remappings_offsets[t + 1]; + const int64_t capacity = index_remappings_end - index_remappings_start; for (int32_t l = threadIdx.x; l < L; l += blockDim.x) { int32_t idx = indices[indices_start + l]; @@ -569,28 +563,30 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ Tensor weights_tys, {% if not nobag %} Tensor D_offsets, - int64_t total_D, + const int64_t total_D, {% else %} - int64_t D, + const int64_t D, {% endif %} - int64_t max_int2_D, - int64_t max_int4_D, - int64_t max_int8_D, - int64_t max_float16_D, - int64_t max_float32_D, + const int64_t max_int2_D, + const int64_t max_int4_D, + const int64_t max_int8_D, + const int64_t max_float16_D, + const int64_t max_float32_D, Tensor indices, Tensor offsets, {% if not nobag %} - int64_t pooling_mode, + const int64_t pooling_mode, {% endif %} - int64_t row_alignment, + const int64_t row_alignment, {% if weighted %} Tensor indice_weights, {% endif %} - int64_t output_dtype, + const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, - int64_t unused + const int64_t max_float8_D, + const int64_t fp8_exponent_bits, + const int64_t fp8_exponent_bias ) { TENSOR_ON_CUDA_GPU(dev_weights); TENSOR_ON_CUDA_GPU(uvm_weights); @@ -612,14 +608,14 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ device_guard.set_index(dev_weights.get_device()); {% if not nobag %} - int32_t T = D_offsets.numel() - 1; + const int32_t T = D_offsets.numel() - 1; {% else %} - int32_t total_L = indices.numel(); - int32_t T = weights_offsets.numel(); + const int32_t total_L = indices.numel(); + const int32_t T = weights_offsets.numel(); {% endif %} TORCH_CHECK(T > 0); // offsets = [B x T + 1] - int32_t B = (offsets.size(0) - 1) / T; + const int32_t B = (offsets.size(0) - 1) / T; TORCH_CHECK(B >= 0); {% if not nobag %} @@ -685,7 +681,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ ); \ 
C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - DISPATCH_OUTPUT_TYPES(output.type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int2_D > 0) { auto max_int2_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int2_D, SparseType::INT2, row_alignment), 128); TORCH_CHECK(max_int2_128b_rows <= 2); @@ -729,7 +725,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ ); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - DISPATCH_OUTPUT_TYPES(output.type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int4_D > 0) { auto max_int4_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int4_D, SparseType::INT4, row_alignment), 128); TORCH_CHECK(max_int4_128b_rows <= 4); @@ -746,7 +742,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ })); #undef X - // launch 8-bit kernel + // launch 8-bit int kernel #define X(OutputRowsPerThread, InputRowsInFlight, MinNum128BRows, MaxNum128BRows) \ nbit::INT8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<<< \ nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \ @@ -776,7 +772,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ ); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - DISPATCH_OUTPUT_TYPES(output.type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int8_D > 0) { auto max_int8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int8_D, SparseType::INT8, row_alignment), 128); TORCH_CHECK(max_int8_128b_rows <= 8); @@ -796,6 +792,58 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ })); #undef X + // launch 8-bit float kernel + #define X(OutputRowsPerThread, InputRowsInFlight, MinNum128BRows, MaxNum128BRows) \ + nbit::FP8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<<< \ + nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \ + dim3(kWarpSize, kWarpsPerBlock), \ + 0, \ + at::cuda::getCurrentCUDAStream()>>>( \ + dev_weights.packed_accessor64(), \ + uvm_weights.packed_accessor64(), \ + weights_placements.packed_accessor32(), \ + weights_offsets.packed_accessor32(), \ + weights_tys.packed_accessor32(), \ + {% if not nobag %} \ + D_offsets.packed_accessor32(), \ + {% else %} \ + D, \ + {% endif %} \ + indices.packed_accessor32(), \ + offsets.packed_accessor32(), \ + {% if not nobag %} \ + pooling_mode, \ + {% endif %} \ + row_alignment, \ + {% if weighted %} indice_weights.packed_accessor32(), {% endif %} \ + fp8_exponent_bits, \ + fp8_exponent_bias, \ + output.packed_accessor32(), \ + lxu_cache_weights.packed_accessor64(), \ + lxu_cache_locations.packed_accessor32() \ + ); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + if (max_float8_D > 0) { + auto max_fp8_128b_rows = 
nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float8_D, SparseType::FP8, row_alignment), 128); + TORCH_CHECK(max_fp8_128b_rows <= 8); + if (max_fp8_128b_rows > 0) { + X(2, 8, 0, 1); + } + if (max_fp8_128b_rows > 1) { + X(2, 4, 1, 2); + } + if (max_fp8_128b_rows > 2) { + X(2, 4, 2, 4); + } + if (max_fp8_128b_rows > 4) { + X(2, 4, 4, 8); + } + } + })); + #undef X + // launch 16-bit kernel #define X(OutputRowsPerThread, InputRowsInFlight, MinNum128BRows, MaxNum128BRows) \ nbit::FP16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<<< \ @@ -826,7 +874,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ ); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - DISPATCH_OUTPUT_TYPES(output.type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float16_D > 0) { auto max_fp16_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float16_D, SparseType::FP16, row_alignment), 128); TORCH_CHECK(max_fp16_128b_rows <= 16); @@ -876,7 +924,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ ); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ - DISPATCH_OUTPUT_TYPES(output.type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { + DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float32_D > 0) { auto max_fp32_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float32_D, SparseType::FP32, row_alignment), 128); TORCH_CHECK(max_fp32_128b_rows <= 32); @@ -906,8 +954,8 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cuda( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(indices.get_device()); auto dense_indices = at::empty_like(indices); - int32_t T = hash_table_offsets.size(0) - 1; - int32_t B = (offsets.size(0) - 1) / T; + const int32_t T = hash_table_offsets.size(0) - 1; + const int32_t B = (offsets.size(0) - 1) / T; TORCH_CHECK(B > 0); TORCH_CHECK(hash_table.size(0) < std::numeric_limits::max()); constexpr size_t kForwardMaxThreads = 256; @@ -943,7 +991,7 @@ Tensor pruned_array_lookup_cuda( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(indices.get_device()); auto dense_indices = at::empty_like(indices); - int32_t T = index_remappings_offsets.size(0) - 1; + const int32_t T = index_remappings_offsets.size(0) - 1; TORCH_CHECK( (offsets.size(0) - 1) % T == 0, "offsets.size() - 1 is not divisible by T! 
offsets.size: ", @@ -951,7 +999,7 @@ Tensor pruned_array_lookup_cuda( "T: ", T ); - int32_t B = (offsets.size(0) - 1) / T; + const int32_t B = (offsets.size(0) - 1) / T; TORCH_CHECK(B > 0, "offsets.size(): ", offsets.size(0), ", T: ", T, ", B: ", B); TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); TORCH_CHECK(indices.dim() == 1, "Tensor dim: ", indices.dim()); diff --git a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp index c2eca730d..89c43fc4e 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp @@ -20,6 +20,7 @@ #include using Tensor = at::Tensor; +using namespace fbgemm_gpu; template void split_embedding_forward_cpu_kernel( @@ -318,267 +319,287 @@ Tensor split_embedding_codegen_grad_indice_weights_cpu( namespace internal { -template -void batched_csr2csc( - BatchedHyperCompressedSparseColumn& batched_csc, +namespace { + +template +void csr2csc_template_( + HyperCompressedSparseColumn& csc, int B, - // TODO: use accessor for the following 3 parameters - const at::TensorAccessor& batched_csr_offsets, - const at::TensorAccessor& batched_csr_indices, - const at::TensorAccessor& batched_csr_weights, + const at::TensorAccessor& csr_offsets, + const at::TensorAccessor& csr_indices, + const at::TensorAccessor& csr_weights, int64_t pooling_mode, const int* table_to_feature_offset, int64_t num_embeddings) { - int num_tables = 1; - batched_csc.num_tables = num_tables; - batched_csc.table_ptr = static_cast( - fbgemm::fbgemmAlignedAlloc(64, (num_tables + 1) * sizeof(int))); - batched_csc.table_ptr[0] = 0; - int64_t nnz = batched_csr_offsets[table_to_feature_offset[num_tables] * B] - - batched_csr_offsets[table_to_feature_offset[0] * B]; + csc.num_non_zero_columns = 0; + int64_t nnz = csr_offsets[table_to_feature_offset[1] * B] - + csr_offsets[table_to_feature_offset[0] * B]; if (nnz == 0) { - batched_csc.table_ptr[1] = 0; return; } - batched_csc.row_indices = + csc.row_indices = static_cast(fbgemm::fbgemmAlignedAlloc(64, nnz * sizeof(int))); - bool has_weights = batched_csr_weights.data() != nullptr; - if (has_weights || - static_cast(pooling_mode) == PoolingMode::MEAN) { - batched_csc.weights = static_cast( + bool has_weights = csr_weights.data() != nullptr; + if (IS_VALUE_PAIR) { + csc.weights = static_cast( fbgemm::fbgemmAlignedAlloc(64, nnz * sizeof(float))); } int column_ptr_curr = 0; - int t = 0; bool is_shared_table = - table_to_feature_offset[t + 1] > table_to_feature_offset[t] + 1; - auto NS = batched_csr_offsets[table_to_feature_offset[t + 1] * B] - - batched_csr_offsets[table_to_feature_offset[t] * B]; + table_to_feature_offset[1] > table_to_feature_offset[0] + 1; + auto NS = csr_offsets[table_to_feature_offset[1] * B] - + csr_offsets[table_to_feature_offset[0] * B]; int num_non_empty_segments = 0; - if (!batched_csc.weights) { - batched_csc.column_segment_ids = - static_cast(fbgemm::fbgemmAlignedAlloc(64, nnz * sizeof(int))); - - int* tmpBufKeys = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - int* tmpBufValues = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - int* tmpBuf1Keys = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - int* tmpBuf1Values = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - const auto FBo = batched_csr_offsets[table_to_feature_offset[t] * B]; - for (int feature = table_to_feature_offset[t]; - feature < table_to_feature_offset[t + 1]; - ++feature) { 
- const auto FBs = (feature - table_to_feature_offset[t]) * B; + + using pair_t = std::pair; + using value_t = typename std::conditional::type; + + csc.column_segment_ids = + static_cast(fbgemm::fbgemmAlignedAlloc(64, nnz * sizeof(int))); + int* tmpBufKeys = + static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); + value_t* tmpBufValues = static_cast( + fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(value_t))); + int* tmpBuf1Keys = + static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); + value_t* tmpBuf1Values = static_cast( + fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(value_t))); + + const auto FBo = csr_offsets[table_to_feature_offset[0] * B]; + for (int feature = table_to_feature_offset[0]; + feature < table_to_feature_offset[1]; + ++feature) { + const auto FBs = (feature - table_to_feature_offset[0]) * B; #pragma omp parallel for - for (int b = 0; b < B; ++b) { - const auto FBb = feature * B + b; - int64_t pool_begin = batched_csr_offsets[FBb]; - int64_t pool_end = batched_csr_offsets[FBb + 1]; - for (int64_t p = pool_begin; p < pool_end; ++p) { - tmpBufKeys[p - FBo] = batched_csr_indices[p]; - tmpBufValues[p - FBo] = FBs + b; + for (int b = 0; b < B; ++b) { + const auto FBb = feature * B + b; + int64_t pool_begin = csr_offsets[FBb]; + int64_t pool_end = csr_offsets[FBb + 1]; + int64_t L = pool_end - pool_begin; + // MEAN pooling will not work with indice_weights! + double scale_factor = + (static_cast(pooling_mode) == PoolingMode::MEAN && + !has_weights && L > 0) + ? 1.0 / L + : 1.0; + + for (int64_t p = pool_begin; p < pool_end; ++p) { + tmpBufKeys[p - FBo] = csr_indices[p]; + if (IS_VALUE_PAIR) { + reinterpret_cast(tmpBufValues)[p - FBo] = std::make_pair( + FBs + b, scale_factor * (has_weights ? csr_weights[p] : 1.0f)); + } else { + reinterpret_cast(tmpBufValues)[p - FBo] = FBs + b; } } } + } + + int* sorted_col_row_index_keys = nullptr; + value_t* sorted_col_row_index_values = nullptr; + + std::tie(sorted_col_row_index_keys, sorted_col_row_index_values) = + fbgemm_gpu::radix_sort_parallel( + tmpBufKeys, + tmpBufValues, + tmpBuf1Keys, + tmpBuf1Values, + NS, + num_embeddings); + + int max_thds = omp_get_max_threads(); + int num_uniq[max_thds][64]; - int* sorted_col_row_index_keys = nullptr; - int* sorted_col_row_index_values = nullptr; - std::tie(sorted_col_row_index_keys, sorted_col_row_index_values) = - fbgemm_gpu::radix_sort_parallel( - tmpBufKeys, - tmpBufValues, - tmpBuf1Keys, - tmpBuf1Values, - NS, - num_embeddings); - - int max_thds = omp_get_max_threads(); - int num_uniq[max_thds][64]; - int U = 0; - if (at::get_num_threads() > 1) { - // This block is not needed for single thread + int U = 0; + if (at::get_num_threads() > 1) { + // This block is not needed for single thread #pragma omp parallel - { - int tid = omp_get_thread_num(); - num_uniq[tid][0] = 0; + { + int tid = omp_get_thread_num(); + num_uniq[tid][0] = 0; #pragma omp for schedule(static) - for (int i = 1; i < NS; i++) { - if (sorted_col_row_index_keys[i] != - sorted_col_row_index_keys[i - 1]) { - num_uniq[tid][0]++; - } + for (int i = 1; i < NS; i++) { + if (sorted_col_row_index_keys[i] != sorted_col_row_index_keys[i - 1]) { + num_uniq[tid][0]++; } } - num_uniq[0][0] += 1; - for (int i = 1; i < max_thds; i++) - num_uniq[i][0] += num_uniq[i - 1][0]; - U = num_uniq[max_thds - 1][0]; } + num_uniq[0][0] += 1; + for (int i = 1; i < max_thds; i++) { + num_uniq[i][0] += num_uniq[i - 1][0]; + } + U = num_uniq[max_thds - 1][0]; + } - batched_csc.column_segment_ptr = static_cast( - 
fbgemm::fbgemmAlignedAlloc(64, (NS + 1) * sizeof(int))); - batched_csc.column_segment_indices = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); + csc.column_segment_ptr = + static_cast(fbgemm::fbgemmAlignedAlloc(64, (NS + 1) * sizeof(int))); + csc.column_segment_indices = + static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); + csc.column_segment_ptr[0] = 0; + const pair_t* sorted_col_row_index_values_pair = + reinterpret_cast(sorted_col_row_index_values); + const int* sorted_col_row_index_values_int = + reinterpret_cast(sorted_col_row_index_values); + if (IS_VALUE_PAIR) { + csc.row_indices[0] = sorted_col_row_index_values_pair[0].first % B; + csc.weights[0] = sorted_col_row_index_values_pair[0].second; + csc.column_segment_ids[0] = sorted_col_row_index_values_pair[0].first / B; + } else { + csc.row_indices[0] = sorted_col_row_index_values_int[0] % B; + csc.column_segment_ids[0] = sorted_col_row_index_values_int[0] / B; + } + csc.column_segment_indices[0] = sorted_col_row_index_keys[0]; - batched_csc.column_segment_ptr[0] = 0; - batched_csc.row_indices[0] = sorted_col_row_index_values[0] % B; - batched_csc.column_segment_indices[0] = sorted_col_row_index_keys[0]; - batched_csc.column_segment_ids[0] = sorted_col_row_index_values[0] / B; #pragma omp parallel - { - int tid = omp_get_thread_num(); - int* tstart = - (tid == 0 - ? batched_csc.column_segment_indices + 1 - : batched_csc.column_segment_indices + num_uniq[tid - 1][0]); - - int* t_offs = - (tid == 0 ? batched_csc.column_segment_ptr + 1 - : batched_csc.column_segment_ptr + num_uniq[tid - 1][0]); - - if (!is_shared_table) { - // For non shared table, no need for computing modulo. - // As an optimization, pointer swap instead of copying. + { + int tid = omp_get_thread_num(); + int* tstart = + (tid == 0 ? csc.column_segment_indices + 1 + : csc.column_segment_indices + num_uniq[tid - 1][0]); + + int* t_offs = + (tid == 0 ? csc.column_segment_ptr + 1 + : csc.column_segment_ptr + num_uniq[tid - 1][0]); + + if (!IS_VALUE_PAIR && !is_shared_table) { + // For non shared table, no need for computing modulo. + // As an optimization, pointer swap instead of copying. #pragma omp master - std::swap( - batched_csc.row_indices, - sorted_col_row_index_values == tmpBufValues ? tmpBufValues - : tmpBuf1Values); - } else { + std::swap( + csc.row_indices, + *reinterpret_cast( + sorted_col_row_index_values == tmpBufValues ? &tmpBufValues + : &tmpBuf1Values)); + } else { #ifdef FBCODE_CAFFE2 - libdivide::divider divisor(B); + libdivide::divider divisor(B); #endif #pragma omp for schedule(static) - for (int i = 1; i < NS; ++i) { - int v = sorted_col_row_index_values[i]; + for (int i = 1; i < NS; ++i) { + int v = IS_VALUE_PAIR ? 
sorted_col_row_index_values_pair[i].first + : sorted_col_row_index_values_int[i]; #ifdef FBCODE_CAFFE2 - int q = v / divisor; + int q = v / divisor; #else - int q = v / B; + int q = v / B; #endif - batched_csc.column_segment_ids[i] = q; - batched_csc.row_indices[i] = v - q * B; + csc.column_segment_ids[i] = q; + csc.row_indices[i] = v - q * B; + if (IS_VALUE_PAIR) { + csc.weights[i] = sorted_col_row_index_values_pair[i].second; } } + } #pragma omp for schedule(static) - for (int i = 1; i < NS; ++i) { - if (sorted_col_row_index_keys[i] != sorted_col_row_index_keys[i - 1]) { - *tstart = sorted_col_row_index_keys[i]; - *t_offs = i; - tstart++; - t_offs++; - } + for (int i = 1; i < NS; ++i) { + if (sorted_col_row_index_keys[i] != sorted_col_row_index_keys[i - 1]) { + *tstart = sorted_col_row_index_keys[i]; + *t_offs = i; + tstart++; + t_offs++; } + } - if (at::get_num_threads() == 1 && tid == 0) { - // Special handling of single thread case - U = t_offs - batched_csc.column_segment_ptr; - } - } // omp parallel - batched_csc.table_ptr[t + 1] = batched_csc.table_ptr[t] + U; - batched_csc.column_segment_ptr[U] = NS; - column_ptr_curr += NS; - fbgemm::fbgemmAlignedFree(tmpBufKeys); - fbgemm::fbgemmAlignedFree(tmpBufValues); - fbgemm::fbgemmAlignedFree(tmpBuf1Keys); - fbgemm::fbgemmAlignedFree(tmpBuf1Values); - } else { - // batched_csc.weights -#ifdef FBCODE_CAFFE2 - folly::F14FastMap< -#else - std::unordered_map< -#endif - int64_t, - std::vector>>> - non_empty_columns; - int f_begin = table_to_feature_offset[t]; - int f_end = table_to_feature_offset[t + 1]; - for (int feature = f_begin; feature < f_end; ++feature) { - for (int b = 0; b < B; ++b) { - int64_t pool_begin = batched_csr_offsets[feature * B + b]; - int64_t pool_end = batched_csr_offsets[feature * B + b + 1]; - int64_t L = pool_end - pool_begin; - // MEAN pooling will not work with indice_weights! - double scale_factor = - (static_cast(pooling_mode) == PoolingMode::MEAN && - !has_weights && L > 0) - ? 1.0 / L - : 1.0; - for (int64_t p = pool_begin; p < pool_end; ++p) { - auto itr = non_empty_columns.find(batched_csr_indices[p]); - if (itr == non_empty_columns.end()) { - itr = non_empty_columns - .emplace( - batched_csr_indices[p], - std::vector>>( - f_end - f_begin)) - .first; - } - if (itr->second[feature - f_begin].empty()) { - ++num_non_empty_segments; - } - itr->second[feature - f_begin].emplace_back( - b, scale_factor * (has_weights ? 
batched_csr_weights[p] : 1.0f)); - } - } - } // for each feature - - batched_csc.table_ptr[t + 1] = - batched_csc.table_ptr[t] + num_non_empty_segments; - batched_csc.column_segment_ptr = static_cast( - fbgemm::fbgemmAlignedAlloc(64, (NS + 1) * sizeof(int))); - batched_csc.column_segment_ptr[0] = 0; - batched_csc.column_segment_indices = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - batched_csc.column_segment_ids = - static_cast(fbgemm::fbgemmAlignedAlloc(64, NS * sizeof(int))); - int k = 1; - for (auto const& column : non_empty_columns) { - int feature = f_begin; - for (auto const& column_segment : column.second) { - if (!column_segment.empty()) { - batched_csc.column_segment_ptr[k] = - column_ptr_curr + column_segment.size(); - batched_csc.column_segment_indices[k - 1] = column.first; - batched_csc.column_segment_ids[k - 1] = feature - f_begin; - k++; - for (auto const& non_zero : column_segment) { - batched_csc.row_indices[column_ptr_curr] = non_zero.first; - batched_csc.weights[column_ptr_curr] = non_zero.second; - ++column_ptr_curr; - } - } - ++feature; - } // for each column segment - } // for each column - } // !batched_csc.weights.empty() + if (at::get_num_threads() == 1 && tid == 0) { + // Special handling of single thread case + U = t_offs - csc.column_segment_ptr; + } + + } // omp parallel + + csc.num_non_zero_columns = U; + csc.column_segment_ptr[U] = NS; + column_ptr_curr += NS; + + fbgemm::fbgemmAlignedFree(tmpBufKeys); + fbgemm::fbgemmAlignedFree(tmpBufValues); + fbgemm::fbgemmAlignedFree(tmpBuf1Keys); + fbgemm::fbgemmAlignedFree(tmpBuf1Values); assert(column_ptr_curr == nnz); } -template void batched_csr2csc( - BatchedHyperCompressedSparseColumn& batched_csc, +#define INSTANTIATE_BATCHED_CSR2CSC(SCALAR_T) \ + template void csr2csc_template_( \ + HyperCompressedSparseColumn & csc, \ + int B, \ + const at::TensorAccessor& csr_offsets, \ + const at::TensorAccessor& csr_indices, \ + const at::TensorAccessor& csr_weights, \ + int64_t pooling_mode, \ + const int* table_to_feature_offset, \ + int64_t num_embeddings); \ + \ + template void csr2csc_template_( \ + HyperCompressedSparseColumn & csc, \ + int B, \ + const at::TensorAccessor& csr_offsets, \ + const at::TensorAccessor& csr_indices, \ + const at::TensorAccessor& csr_weights, \ + int64_t pooling_mode, \ + const int* table_to_feature_offset, \ + int64_t num_embeddings); + +INSTANTIATE_BATCHED_CSR2CSC(float) +INSTANTIATE_BATCHED_CSR2CSC(double) +#undef INSTANTIATE_BATCHED_CSR2CSC + +} // namespace + +template +void csr2csc( + HyperCompressedSparseColumn& csc, + int B, + const at::TensorAccessor& csr_offsets, + const at::TensorAccessor& csr_indices, + const at::TensorAccessor& csr_weights, + int64_t pooling_mode, + const int* table_to_feature_offset, + int64_t num_embeddings) { + bool has_weights = csr_weights.data() != nullptr; + if (has_weights || + static_cast(pooling_mode) == PoolingMode::MEAN) { + csr2csc_template_( + csc, + B, + csr_offsets, + csr_indices, + csr_weights, + pooling_mode, + table_to_feature_offset, + num_embeddings); + } else { + csr2csc_template_( + csc, + B, + csr_offsets, + csr_indices, + csr_weights, + pooling_mode, + table_to_feature_offset, + num_embeddings); + } +} + +template void csr2csc( + HyperCompressedSparseColumn& csc, int B, - const at::TensorAccessor& batched_csr_offsets, - const at::TensorAccessor& batched_csr_indices, - const at::TensorAccessor& batched_csr_weights, + const at::TensorAccessor& csr_offsets, + const at::TensorAccessor& csr_indices, + const 
at::TensorAccessor& csr_weights, int64_t pooling_mode, const int* table_to_feature_offset, int64_t num_embeddings); -template void batched_csr2csc( - BatchedHyperCompressedSparseColumn& batched_csc, +template void csr2csc( + HyperCompressedSparseColumn& csc, int B, - const at::TensorAccessor& batched_csr_offsets, - const at::TensorAccessor& batched_csr_indices, - const at::TensorAccessor& batched_csr_weights, + const at::TensorAccessor& csr_offsets, + const at::TensorAccessor& csr_indices, + const at::TensorAccessor& csr_weights, int64_t pooling_mode, const int* table_to_feature_offset, int64_t num_embeddings); diff --git a/fbgemm_gpu/codegen/embedding_forward_split_cpu.h b/fbgemm_gpu/codegen/embedding_forward_split_cpu.h index ad2eaf02d..c8b7b25ca 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_cpu.h +++ b/fbgemm_gpu/codegen/embedding_forward_split_cpu.h @@ -32,26 +32,21 @@ at::Tensor split_embedding_codegen_grad_indice_weights_cpu( at::Tensor feature_requires_grad); namespace internal { -// A batch of compressed sparse row but each sparse matrix is hyper sparse +// A compressed sparse column representation that is hyper sparse, // meaning there can be many columns without any non-zeros. -struct BatchedHyperCompressedSparseColumn { - int num_tables; // # of matrices (or tables) - // pointers to the beginning of each table in column_ptr (length T + 1) - int* table_ptr = nullptr; +struct HyperCompressedSparseColumn { + int num_non_zero_columns; // pointers to the beginning of each column segment in row_indices - // (length table_ptr[T] + 1) + // (length num_non_zero_columns + 1) // For a shared table, a column can have multiple segments, each for a // feature sharing the table. In this case, the segments will have the // same column_segment_indices but different column_segment_ids.
int* column_segment_ptr = nullptr; - int* column_segment_indices = nullptr; // length table_ptr[T] - int* column_segment_ids = nullptr; // length table_ptr[T] - int* row_indices = nullptr; // length column_ptr[table_ptr[T]] - float* weights = nullptr; // length column_ptr[table_ptr[T]] - ~BatchedHyperCompressedSparseColumn() { - if (table_ptr) { - fbgemm::fbgemmAlignedFree(table_ptr); - } + int* column_segment_indices = nullptr; // length num_non_zero_columns + int* column_segment_ids = nullptr; // length num_non_zero_columns + int* row_indices = nullptr; // length column_ptr[num_non_zero_columns] + float* weights = nullptr; // length column_ptr[num_non_zero_columns] + ~HyperCompressedSparseColumn() { if (column_segment_ptr) { fbgemm::fbgemmAlignedFree(column_segment_ptr); fbgemm::fbgemmAlignedFree(column_segment_indices); @@ -65,12 +60,12 @@ struct BatchedHyperCompressedSparseColumn { }; template -void batched_csr2csc( - BatchedHyperCompressedSparseColumn& batched_csc, +void csr2csc( + HyperCompressedSparseColumn& csc, int B, - const at::TensorAccessor& batched_csr_offsets, - const at::TensorAccessor& batched_csr_indices, - const at::TensorAccessor& batched_csr_weights, + const at::TensorAccessor& csr_offsets, + const at::TensorAccessor& csr_indices, + const at::TensorAccessor& csr_weights, int64_t pooling_mode, const int* table_to_feature_offset, int64_t num_embeddings); diff --git a/fbgemm_gpu/codegen/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_template.cu index bd2c7e93e..c6f59f71f 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_template.cu @@ -26,6 +26,126 @@ constexpr size_t kForwardMaxThreads = 512; using Tensor = at::Tensor; using namespace fbgemm_gpu; +{% if not weighted %} +template < + typename emb_t, + typename cache_t, + {% if not dense %} + typename output_t, + {% endif %} + typename index_t, + size_t kThreadGroupSize + > +__launch_bounds__(kForwardMaxThreads) +__global__ void {{ "dense" if dense else "split" }}_embedding_nobag_codegen_forward_unweighted_small_kernel( + const at::PackedTensorAccessor64 dev_weights, + {% if not dense %} + const at::PackedTensorAccessor64 uvm_weights, + const at::PackedTensorAccessor64 + lxu_cache_weights, + const at::PackedTensorAccessor32 + weights_placements, + {% endif %} + const at::PackedTensorAccessor32 weights_offsets, + int64_t D, + const at::PackedTensorAccessor32 indices, + const at::PackedTensorAccessor32 offsets, + {% if not dense %} + const at::PackedTensorAccessor32 + lxu_cache_locations, + at::PackedTensorAccessor32 + output // [B][total_D], + {% else %} + at::PackedTensorAccessor32, 2, at::RestrictPtrTraits> + output // [B][total_D], + {% endif %} + ) { + int32_t T = weights_offsets.size(0); + int32_t B = (offsets.size(0) - 1) / T; + int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; + int32_t t = b_t / B; + int32_t b = b_t % B; + + if (b_t >= B * T) { + return; + } + int64_t weights_offset = weights_offsets[t]; + index_t indices_start = offsets[t * B + b]; + index_t indices_end = offsets[t * B + b + 1]; + int32_t L = indices_end - indices_start; + const emb_t* __restrict__ weights; + {% if not dense %} + const auto placement = static_cast(weights_placements[t]); + if (placement == PlacementType::DEVICE) { + weights = &dev_weights[weights_offset]; + } else { + weights = &uvm_weights[weights_offset]; + } + {% else %} + weights = &dev_weights[weights_offset]; + {% endif %} + + int32_t D_emb = D; + if (std::is_same::value) { 
+ D_emb += kINT8QparamsBytes; + } + + constexpr int32_t kNumThreadGroup = kWarpSize / kThreadGroupSize; + const int32_t group_start = threadIdx.x / kThreadGroupSize * kThreadGroupSize; + const int32_t group_end = group_start + kThreadGroupSize; + const int32_t d = threadIdx.x % kThreadGroupSize * 4; + + for (int32_t l_start = 0; l_start < L; l_start += kWarpSize) { + int32_t l = l_start + threadIdx.x; + int64_t idx = l < L ? indices[indices_start + l] : 0; + {% if not dense %} + int32_t cache_idx = (placement == PlacementType::MANAGED_CACHING && l < L) ? lxu_cache_locations[indices_start + l] : 0; + {% endif %} + for (auto j = group_start; j < group_end && l_start + j < L; ++j) { + int64_t idx_j = shfl_sync(idx, j); + int64_t output_j = indices_start + l_start + j; + {% if not dense %} + int32_t cache_idx_j = shfl_sync(cache_idx, j); + {% endif %} + + {% if not dense %} + auto weight_row_cache = WeightRow( + const_cast(&weights[idx_j * D_emb]), + const_cast(&lxu_cache_weights[cache_idx_j][0]), + D, + nullptr); + float2 qparams_cache; // assume cache is fp16/fp32 which doesn't require qparams + + {% endif %} + auto weight_row_emb = WeightRow( + const_cast(&weights[idx_j * D_emb]), + nullptr, + D, + nullptr); + float2 qparams_emb; + if (std::is_same::value) { + qparams_emb = weight_row_emb.load_qparams(); + } + + if (d < D) { + {% if not dense %} + if (placement == PlacementType::MANAGED_CACHING && cache_idx_j != kCacheLocationMissing) { + Vec4T weight = weight_row_cache.load(d, qparams_cache); + weight.store(&output[output_j][d]); + } else { + Vec4T weight = weight_row_emb.load(d, qparams_emb); + weight.store(&output[output_j][d]); + } + {% else %} + Vec4T weight = weight_row_emb.load(d, qparams_emb); + weight.store(&output[output_j][d]); + {% endif %} + } + } + } +} +{% endif %} + {% for nobag in [True, False] %} {% if not nobag or not weighted %} template < @@ -361,7 +481,7 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else "" {% endif %} {% else %} {% if dense %} - if (dev_weights.type().scalarType() == at::kHalf || dev_weights.type().scalarType() == at::kByte) { + if (dev_weights.scalar_type() == at::kHalf || dev_weights.scalar_type() == at::kByte) { output = at::empty({B, total_D}, dev_weights.options().dtype(at::kFloat)); } else { output = at::empty({B, total_D}, dev_weights.options()); @@ -439,6 +559,45 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else "" } {% endfor %} {% else %} + {% for kEmbeddingSize in [4, 8, 16, 32] %} + if (D <= {{ kEmbeddingSize }}) { + {% if not dense %} + split_embedding_nobag_codegen_forward_unweighted_small_kernel<<< + {% else %} + dense_embedding_nobag_codegen_forward_unweighted_small_kernel<<< + {% endif %} + div_round_up((B * T), kForwardMaxThreads / kWarpSize), + dim3(kWarpSize, kForwardMaxThreads / kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + dev_weights.packed_accessor64<{{ "scalar_t" if dense else "emb_t" }}, 1, at::RestrictPtrTraits>(), + {% if not dense %} + uvm_weights.packed_accessor64(), + lxu_cache_weights.packed_accessor64(), + weights_placements.packed_accessor32(), + {% endif %} + weights_offsets.packed_accessor32(), + D, + indices.packed_accessor32(), + offsets.packed_accessor32(), + {% if not dense %} + lxu_cache_locations.packed_accessor32(), + output.packed_accessor32< + output_t, + 2, + at::RestrictPtrTraits>() + ); + {% else %} + output.packed_accessor32< + at::acc_type, + 2, + at::RestrictPtrTraits>() + ); + {% endif %} + + return; + } + {% endfor %} {% if not 
dense %} split_embedding_nobag_codegen_forward_unweighted_kernel<<< {% else %} diff --git a/fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh b/fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh index 4a7e517a6..24e76e8c7 100644 --- a/fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh +++ b/fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh @@ -10,11 +10,7 @@ #include #include #include -#if !defined(NEW_ATOMIC_PATH) -#include -#else #include -#endif // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" diff --git a/fbgemm_gpu/docs/Doxyfile.in b/fbgemm_gpu/docs/Doxyfile.in new file mode 100644 index 000000000..f36821d10 --- /dev/null +++ b/fbgemm_gpu/docs/Doxyfile.in @@ -0,0 +1,2678 @@ +# Doxyfile 1.9.4 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables: +# doxygen -x_noenv [configFile] + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "fbgemm_gpu" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = "build" + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# numer of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = NO + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = NO + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. 
If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. 
If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:^^" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. 
+ +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = .cu=C++ \ + .cuh=C++ \ + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). 
This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS.
This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. 
If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = YES + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = YES + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. 
+# The default value is: NO. + +HIDE_SCOPE_NAMES = YES + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. + +SHOW_HEADERFILE = YES + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond +# <section_label> ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = NO + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of the +# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation.
+ +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. 
If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = "../include/fbgemm_gpu" \ + "../src/" \ + "../codegen/" \ + "../include/fbgemm_gpu/layout_transform_ops.cuh" \ + "../include/fbgemm_gpu/permute_pooled_embedding_ops_split.h" \ + "../include/fbgemm_gpu/merge_pooled_embeddings.h" \ + "../include/fbgemm_gpu/sparse_ops.h" \ + "../src/quantize_ops.cu" \ + "../src/quantize_ops_cpu.cpp" \ + "../src/split_table_batched_embeddings.cpp" \ + "../src/jagged_tensor_ops.cu" \ + "../src/jagged_tensor_ops_cpu.cpp" \ + "../src/cumem_utils.h" \ + "../include/fbgemm_gpu/input_combine.h" \ + "../src/layout_transform_ops.cu" \ + "../src/layout_transform_ops_cpu.cpp" \ + "../codegen/embedding_backward_split_host_template.cpp" \ + "../codegen/embedding_backward_split_host_cpu_template.cpp" \ + "../codegen/embedding_forward_quantized_host.cpp" \ + "../codegen/embedding_forward_quantized_host_cpu.cpp" \ + "../codegen/embedding_bounds_check_host.cpp" \ + "../codegen/embedding_bounds_check_host_cpu.cpp" \ + "../codegen/embedding_backward_dense_host.cpp" \ + "../codegen/embedding_forward_template_helpers.cuh" \ + "../src/permute_pooled_embedding_ops_gpu.cpp" + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding.
Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, +# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C +# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.c \ + *.cpp \ + *.cuh \ + *.cu \ + *.h \ + *.hpp \ + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# ANamespace::AClass, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. 
+# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO.
+ +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = NO + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. 
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. 
+# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a color-wheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use gray-scales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. 
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# on Windows. 
In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline the HTML help workshop was already many years +# in maintenance mode). You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. 
+# For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. For more information please see Qt Help
+# Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty, doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated that, together with the HTML files, form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files need
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin,
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at the
+# top of each HTML page. A value of NO enables the index and the value YES
+# disables it. Since the tabs in the index contain the same information as the
+# navigation tree, you can set this option to YES if you also set
+# GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes take effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output.
+# When enabled you may also need to install MathJax separately and configure
+# the path to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that other MathJax
+# settings also have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for MathJax version 2; for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for MathJax version 3; for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax.
+# The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH =
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html
+# #tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
+# possible to search using the keyboard; to jump to the search box use
+# <access key> + S (what the <access key> is depends on the OS and browser, but
+# it is typically <CTRL>, <ALT>/