[fbgemm_gpu] Break down fbgemm_gpu_tbe_training_backward module further, pt 3

- Break down the `fbgemm_gpu_tbe_training_backward` module further to work around
  instruction relocation issues in CUDA 12.8 (see the diagnostic sketch below)
q10 committed Feb 15, 2025
1 parent a4be13a commit db89295
Showing 6 changed files with 64 additions and 39 deletions.
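Splitting the backward module into additional shared libraries reduces how much device code gets linked into any single .so, which is the sort of thing that can otherwise run into relocation limits at link time for very large binaries. A rough way to compare relocation pressure across the built libraries is sketched below; this is an illustrative diagnostic only, not part of the commit — it assumes the built .so files sit under fbgemm_gpu/ and that readelf (binutils) is on PATH.

# Hypothetical diagnostic (not part of this commit): count relocation entries
# in each of the split TBE backward libraries. Assumes readelf is on PATH and
# the built .so files are located under fbgemm_gpu/; paths are illustrative.
import glob
import subprocess

def relocation_count(so_path: str) -> int:
    # readelf -r prints the relocation sections; entry lines begin with a hex offset.
    out = subprocess.run(
        ["readelf", "-r", so_path], capture_output=True, text=True, check=True
    ).stdout
    return sum(1 for line in out.splitlines() if line[:1].isdigit())

for so in sorted(glob.glob("fbgemm_gpu/fbgemm_gpu_tbe_training_backward*.so")):
    print(f"{so}: {relocation_count(so)} relocation entries")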
6 changes: 4 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -118,7 +118,8 @@ __configure_fbgemm_gpu_build_nvcc () {
}

__configure_fbgemm_gpu_cuda_home () {
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
@@ -222,7 +223,8 @@ __configure_fbgemm_gpu_build_cuda () {

if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
[[ $cuda_version_nvcc == *"V12.4"* ]] ||
[[ $cuda_version_nvcc == *"V12.6"* ]]; then
[[ $cuda_version_nvcc == *"V12.6"* ]] ||
[[ $cuda_version_nvcc == *"V12.8"* ]]; then
# sm_90 and sm_90a are only available for CUDA 12.1+
# NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
# See:
11 changes: 7 additions & 4 deletions .github/scripts/utils_cuda.bash
@@ -18,11 +18,12 @@ __set_cuda_symlinks_envvars () {
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
# CUDA 12.6 installation has a very different package layout than previous
# CUDA versions - notably, NVTX has been moved elsewhere, which causes
# PyTorch CMake scripts to complain.
echo "[INSTALL] Fixing file placements for CUDA 12.6+ ..."
echo "[INSTALL] Fixing file placements for CUDA ${BUILD_CUDA_VERSION}+ ..."

echo "[INSTALL] Creating symlinks: libnvToolsExt.so"
print_exec ln -sf "${conda_prefix}/lib/libnvToolsExt.so.1" "${conda_prefix}/lib/libnvToolsExt.so"
@@ -89,7 +90,8 @@ __set_nvcc_prepend_flags () {
# which overrides whatever `-ccbin` flag we set manually, so remove this
# unwanted hook
print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
fi
@@ -192,7 +194,8 @@ install_cuda () {
# in the future, we will be using conda-forge for installing all CUDA versions
# (except for versions 11.8 and below, which are only available through
# nvidia/label/cuda-*)
if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
[[ "$cuda_version" =~ ^12.8.*$ ]]; then
# shellcheck disable=SC2086
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
cuda=${cuda_version}) || return 1
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -73,7 +73,7 @@ jobs:
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
compiler: [ "gcc", "clang" ]

steps:
@@ -157,7 +157,7 @@ jobs:
# { arch: x86, instance: "linux.gcp.a100" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.4.1" ]
compiler: [ "gcc", "clang" ]
18 changes: 17 additions & 1 deletion fbgemm_gpu/cmake/TbeTraining.cmake
@@ -30,12 +30,14 @@ get_tbe_sources_list(gen_gpu_files_training_pt2)
get_tbe_sources_list(gen_gpu_files_training_dense)
get_tbe_sources_list(gen_gpu_files_training_split_host)
get_tbe_sources_list(gen_gpu_files_training_gwd)
get_tbe_sources_list(gen_gpu_files_training_vbe)
handle_genfiles_rocm(gen_cpu_files_training)
handle_genfiles_rocm(gen_gpu_files_training)
handle_genfiles_rocm(gen_gpu_files_training_pt2)
handle_genfiles_rocm(gen_gpu_files_training_dense)
handle_genfiles_rocm(gen_gpu_files_training_split_host)
handle_genfiles_rocm(gen_gpu_files_training_gwd)
handle_genfiles_rocm(gen_gpu_files_training_vbe)

# Index Select
get_tbe_sources_list(static_cpu_files_index_select)
@@ -204,7 +206,6 @@ gpu_cpp_library(
DESTINATION
fbgemm_gpu)


gpu_cpp_library(
PREFIX
fbgemm_gpu_tbe_training_backward_gwd
@@ -221,6 +222,21 @@ gpu_cpp_library(
DESTINATION
fbgemm_gpu)

gpu_cpp_library(
PREFIX
fbgemm_gpu_tbe_training_backward_vbe
TYPE
SHARED
INCLUDE_DIRS
${fbgemm_sources_include_directories}
GPU_SRCS
${gen_gpu_files_training_vbe}
NVCC_FLAGS
${TORCH_CUDA_OPTIONS}
DEPS
fbgemm_gpu_tbe_training_backward
DESTINATION
fbgemm_gpu)

gpu_cpp_library(
PREFIX
63 changes: 33 additions & 30 deletions fbgemm_gpu/cmake/tbe_sources.py
@@ -421,37 +421,8 @@
)
]

gen_gpu_files_training = (
gen_gpu_files_training_vbe = (
[
"gen_embedding_backward_split_grad_embedding_ops.cu",
]
+ [
# Backward-split positional weights and forward src files
fstring.format(desc)
for desc in DENSE_OPTIONS
for fstring in [
"gen_embedding_backward_{}_indice_weights_codegen_cuda.cu",
]
]
+ [
fstring.format(
optimizer,
"ssd" if ssd else "split",
wdesc,
)
for ssd in [
True,
False,
]
for optimizer in (SSD_OPTIMIZERS if ssd else GPU_OPTIMIZERS)
for wdesc in WEIGHT_OPTIONS
for fstring in [
"gen_embedding_backward_{}_{}_{}_cuda.cu",
"gen_embedding_backward_{}_{}_{}_kernel_cta.cu",
"gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
]
]
+ [
fstring.format(optimizer, wdesc)
for optimizer in VBE_OPTIMIZERS
for wdesc in PARTIAL_WEIGHT_OPTIONS
@@ -487,6 +458,38 @@
]
)

gen_gpu_files_training = (
[
"gen_embedding_backward_split_grad_embedding_ops.cu",
]
+ [
# Backward-split positional weights and forward src files
fstring.format(desc)
for desc in DENSE_OPTIONS
for fstring in [
"gen_embedding_backward_{}_indice_weights_codegen_cuda.cu",
]
]
+ [
fstring.format(
optimizer,
"ssd" if ssd else "split",
wdesc,
)
for ssd in [
True,
False,
]
for optimizer in (SSD_OPTIMIZERS if ssd else GPU_OPTIMIZERS)
for wdesc in WEIGHT_OPTIONS
for fstring in [
"gen_embedding_backward_{}_{}_{}_cuda.cu",
"gen_embedding_backward_{}_{}_{}_kernel_cta.cu",
"gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
]
]
)

gen_hip_files_training = [
"gen_embedding_backward_split_{}{}_device_kernel_hip.hip".format(
"weighted" if weighted else "unweighted",
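The tbe_sources.py change lifts the VBE-specific generated sources out of gen_gpu_files_training and into the new gen_gpu_files_training_vbe list, so each generated .cu file should now feed exactly one CMake target. A quick consistency check along these lines — a sketch that assumes tbe_sources.py can be loaded standalone and that you run it from the repository root — confirms the two lists stayed disjoint:

# Sketch (not part of the commit): confirm the VBE split left no file in both
# the main training list and the new VBE list. Run from the repository root.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "tbe_sources", "fbgemm_gpu/cmake/tbe_sources.py"
)
tbe_sources = importlib.util.module_from_spec(spec)
spec.loader.exec_module(tbe_sources)

main_files = set(tbe_sources.gen_gpu_files_training)
vbe_files = set(tbe_sources.gen_gpu_files_training_vbe)
overlap = main_files & vbe_files

print(f"{len(main_files)} main files, {len(vbe_files)} VBE files, {len(overlap)} shared")
assert not overlap, f"files assigned to both targets: {sorted(overlap)}"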
1 change: 1 addition & 0 deletions fbgemm_gpu/fbgemm_gpu/__init__.py
@@ -47,6 +47,7 @@ def _load_library(filename: str) -> None:
"fbgemm_gpu_tbe_training_backward_dense",
"fbgemm_gpu_tbe_training_backward_split_host",
"fbgemm_gpu_tbe_training_backward_gwd",
"fbgemm_gpu_tbe_training_backward_vbe",
"fbgemm_gpu_py",
]
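The __init__.py change simply adds the new library name to the list that _load_library iterates over at import time. Conceptually, the load step amounts to the sketch below; it assumes a package-relative <name>.so for each entry and uses torch.ops.load_library, whereas the real _load_library also handles platform-specific suffixes and load failures.

# Minimal sketch (assumptions noted above): loading each split library by name.
# Assumes a <name>.so sits next to the package __init__.py and that
# torch.ops.load_library registers its operators.
import os
import torch

_PACKAGE_DIR = os.path.dirname(__file__)

for _name in [
    "fbgemm_gpu_tbe_training_backward_gwd",
    "fbgemm_gpu_tbe_training_backward_vbe",  # newly split-out VBE library
]:
    torch.ops.load_library(os.path.join(_PACKAGE_DIR, f"{_name}.so"))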
