[fbgemm_gpu] Break down fbgemm_gpu_tbe_training_backward module further, pt 3

- Break down the `fbgemm_gpu_tbe_training_backward` module further to work around
  instruction relocation issues in CUDA 12.8 (see the diagnostic sketch below)
q10 committed Feb 15, 2025
1 parent a4be13a commit db89295
Showing 6 changed files with 64 additions and 39 deletions.
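Splitting the backward module into additional shared libraries reduces how much device code gets linked into any single .so, which is the sort of thing that can otherwise run into relocation limits at link time for very large binaries. A rough way to compare relocation pressure across the built libraries is sketched below; this is an illustrative diagnostic only, not part of the commit — it assumes the built .so files sit under fbgemm_gpu/ and that readelf (binutils) is on PATH.

# Hypothetical diagnostic (not part of this commit): count relocation entries
# in each of the split TBE backward libraries. Assumes readelf is on PATH and
# the built .so files are located under fbgemm_gpu/; paths are illustrative.
import glob
import subprocess

def relocation_count(so_path: str) -> int:
    # readelf -r prints the relocation sections; entry lines begin with a hex offset.
    out = subprocess.run(
        ["readelf", "-r", so_path], capture_output=True, text=True, check=True
    ).stdout
    return sum(1 for line in out.splitlines() if line[:1].isdigit())

for so in sorted(glob.glob("fbgemm_gpu/fbgemm_gpu_tbe_training_backward*.so")):
    print(f"{so}: {relocation_count(so)} relocation entries")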
6 changes: 4 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -118,7 +118,8 @@ __configure_fbgemm_gpu_build_nvcc () {
}

__configure_fbgemm_gpu_cuda_home () {
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
@@ -222,7 +223,8 @@ __configure_fbgemm_gpu_build_cuda () {

if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
[[ $cuda_version_nvcc == *"V12.4"* ]] ||
[[ $cuda_version_nvcc == *"V12.6"* ]]; then
[[ $cuda_version_nvcc == *"V12.6"* ]] ||
[[ $cuda_version_nvcc == *"V12.8"* ]]; then
# sm_90 and sm_90a are only available for CUDA 12.1+
# NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
# See:
11 changes: 7 additions & 4 deletions .github/scripts/utils_cuda.bash
@@ -18,11 +18,12 @@ __set_cuda_symlinks_envvars () {
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
# CUDA 12.6 installation has a very different package layout than previous
# CUDA versions - notably, NVTX has been moved elsewhere, which causes
# PyTorch CMake scripts to complain.
echo "[INSTALL] Fixing file placements for CUDA 12.6+ ..."
echo "[INSTALL] Fixing file placements for CUDA ${BUILD_CUDA_VERSION}+ ..."

echo "[INSTALL] Creating symlinks: libnvToolsExt.so"
print_exec ln -sf "${conda_prefix}/lib/libnvToolsExt.so.1" "${conda_prefix}/lib/libnvToolsExt.so"
@@ -89,7 +90,8 @@ __set_nvcc_prepend_flags () {
# which overrides whatever `-ccbin` flag we set manually, so remove this
# unwanted hook
print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
fi
@@ -192,7 +194,8 @@ install_cuda () {
# in the future, we will be using conda-forge for installing all CUDA versions
# (except for versions 11.8 and below, which are only available through
# nvidia/label/cuda-*)
if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
[[ "$cuda_version" =~ ^12.8.*$ ]]; then
# shellcheck disable=SC2086
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
cuda=${cuda_version}) || return 1
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -73,7 +73,7 @@ jobs:
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
compiler: [ "gcc", "clang" ]

steps:
@@ -157,7 +157,7 @@ jobs:
# { arch: x86, instance: "linux.gcp.a100" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.4.1" ]
compiler: [ "gcc", "clang" ]
18 changes: 17 additions & 1 deletion fbgemm_gpu/cmake/TbeTraining.cmake
@@ -30,12 +30,14 @@ get_tbe_sources_list(gen_gpu_files_training_pt2)
get_tbe_sources_list(gen_gpu_files_training_dense)
get_tbe_sources_list(gen_gpu_files_training_split_host)
get_tbe_sources_list(gen_gpu_files_training_gwd)
get_tbe_sources_list(gen_gpu_files_training_vbe)
handle_genfiles_rocm(gen_cpu_files_training)
handle_genfiles_rocm(gen_gpu_files_training)
handle_genfiles_rocm(gen_gpu_files_training_pt2)
handle_genfiles_rocm(gen_gpu_files_training_dense)
handle_genfiles_rocm(gen_gpu_files_training_split_host)
handle_genfiles_rocm(gen_gpu_files_training_gwd)
handle_genfiles_rocm(gen_gpu_files_training_vbe)

# Index Select
get_tbe_sources_list(static_cpu_files_index_select)
@@ -204,7 +206,6 @@ gpu_cpp_library(
DESTINATION
fbgemm_gpu)


gpu_cpp_library(
PREFIX
fbgemm_gpu_tbe_training_backward_gwd
@@ -221,6 +222,21 @@ gpu_cpp_library(
DESTINATION
fbgemm_gpu)

gpu_cpp_library(
PREFIX
fbgemm_gpu_tbe_training_backward_vbe
TYPE
SHARED
INCLUDE_DIRS
${fbgemm_sources_include_directories}
GPU_SRCS
${gen_gpu_files_training_vbe}
NVCC_FLAGS
${TORCH_CUDA_OPTIONS}
DEPS
fbgemm_gpu_tbe_training_backward
DESTINATION
fbgemm_gpu)

gpu_cpp_library(
PREFIX
63 changes: 33 additions & 30 deletions fbgemm_gpu/cmake/tbe_sources.py
@@ -421,37 +421,8 @@
)
]

gen_gpu_files_training = (
gen_gpu_files_training_vbe = (
[
"gen_embedding_backward_split_grad_embedding_ops.cu",
]
+ [
# Backward-split positional weights and forward src files
fstring.format(desc)
for desc in DENSE_OPTIONS
for fstring in [
"gen_embedding_backward_{}_indice_weights_codegen_cuda.cu",
]
]
+ [
fstring.format(
optimizer,
"ssd" if ssd else "split",
wdesc,
)
for ssd in [
True,
False,
]
for optimizer in (SSD_OPTIMIZERS if ssd else GPU_OPTIMIZERS)
for wdesc in WEIGHT_OPTIONS
for fstring in [
"gen_embedding_backward_{}_{}_{}_cuda.cu",
"gen_embedding_backward_{}_{}_{}_kernel_cta.cu",
"gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
]
]
+ [
fstring.format(optimizer, wdesc)
for optimizer in VBE_OPTIMIZERS
for wdesc in PARTIAL_WEIGHT_OPTIONS
@@ -487,6 +458,38 @@
]
)

gen_gpu_files_training = (
[
"gen_embedding_backward_split_grad_embedding_ops.cu",
]
+ [
# Backward-split positional weights and forward src files
fstring.format(desc)
for desc in DENSE_OPTIONS
for fstring in [
"gen_embedding_backward_{}_indice_weights_codegen_cuda.cu",
]
]
+ [
fstring.format(
optimizer,
"ssd" if ssd else "split",
wdesc,
)
for ssd in [
True,
False,
]
for optimizer in (SSD_OPTIMIZERS if ssd else GPU_OPTIMIZERS)
for wdesc in WEIGHT_OPTIONS
for fstring in [
"gen_embedding_backward_{}_{}_{}_cuda.cu",
"gen_embedding_backward_{}_{}_{}_kernel_cta.cu",
"gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
]
]
)

gen_hip_files_training = [
"gen_embedding_backward_split_{}{}_device_kernel_hip.hip".format(
"weighted" if weighted else "unweighted",
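The tbe_sources.py change lifts the VBE-specific generated sources out of gen_gpu_files_training and into the new gen_gpu_files_training_vbe list, so each generated .cu file should now feed exactly one CMake target. A quick consistency check along these lines — a sketch that assumes tbe_sources.py can be loaded standalone and that you run it from the repository root — confirms the two lists stayed disjoint:

# Sketch (not part of the commit): confirm the VBE split left no file in both
# the main training list and the new VBE list. Run from the repository root.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "tbe_sources", "fbgemm_gpu/cmake/tbe_sources.py"
)
tbe_sources = importlib.util.module_from_spec(spec)
spec.loader.exec_module(tbe_sources)

main_files = set(tbe_sources.gen_gpu_files_training)
vbe_files = set(tbe_sources.gen_gpu_files_training_vbe)
overlap = main_files & vbe_files

print(f"{len(main_files)} main files, {len(vbe_files)} VBE files, {len(overlap)} shared")
assert not overlap, f"files assigned to both targets: {sorted(overlap)}"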
1 change: 1 addition & 0 deletions fbgemm_gpu/fbgemm_gpu/__init__.py
@@ -47,6 +47,7 @@ def _load_library(filename: str) -> None:
"fbgemm_gpu_tbe_training_backward_dense",
"fbgemm_gpu_tbe_training_backward_split_host",
"fbgemm_gpu_tbe_training_backward_gwd",
"fbgemm_gpu_tbe_training_backward_vbe",
"fbgemm_gpu_py",
]
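The __init__.py change simply adds the new library name to the list that _load_library iterates over at import time. Conceptually, the load step amounts to the sketch below; it assumes a package-relative <name>.so for each entry and uses torch.ops.load_library, whereas the real _load_library also handles platform-specific suffixes and load failures.

# Minimal sketch (assumptions noted above): loading each split library by name.
# Assumes a <name>.so sits next to the package __init__.py and that
# torch.ops.load_library registers its operators.
import os
import torch

_PACKAGE_DIR = os.path.dirname(__file__)

for _name in [
    "fbgemm_gpu_tbe_training_backward_gwd",
    "fbgemm_gpu_tbe_training_backward_vbe",  # newly split-out VBE library
]:
    torch.ops.load_library(os.path.join(_PACKAGE_DIR, f"{_name}.so"))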
