[fbgemm_gpu] Break down fbgemm_gpu_tbe_training_backward module further, pt 3 #3694

Status: Closed (wants to merge 1 commit)

.github/scripts/fbgemm_gpu_build.bash (4 additions, 2 deletions)
@@ -118,7 +118,8 @@ __configure_fbgemm_gpu_build_nvcc () {
 }

 __configure_fbgemm_gpu_cuda_home () {
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2155,SC2086
     local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
     local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

@@ -222,7 +223,8 @@ __configure_fbgemm_gpu_build_cuda () {

   if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
      [[ $cuda_version_nvcc == *"V12.4"* ]] ||
-     [[ $cuda_version_nvcc == *"V12.6"* ]]; then
+     [[ $cuda_version_nvcc == *"V12.6"* ]] ||
+     [[ $cuda_version_nvcc == *"V12.8"* ]]; then
     # sm_90 and sm_90a are only available for CUDA 12.1+
     # NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
     # See:

.github/scripts/utils_cuda.bash (7 additions, 4 deletions)

@@ -18,11 +18,12 @@ __set_cuda_symlinks_envvars () {
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
   local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # CUDA 12.6 installation has a very different package layout than previous
     # CUDA versions - notably, NVTX has been moved elsewhere, which causes
     # PyTorch CMake scripts to complain.
-    echo "[INSTALL] Fixing file placements for CUDA 12.6+ ..."
+    echo "[INSTALL] Fixing file placements for CUDA ${BUILD_CUDA_VERSION}+ ..."

     echo "[INSTALL] Creating symlinks: libnvToolsExt.so"
     print_exec ln -sf "${conda_prefix}/lib/libnvToolsExt.so.1" "${conda_prefix}/lib/libnvToolsExt.so"

@@ -89,7 +90,8 @@ __set_nvcc_prepend_flags () {
   # which overrides whatever `-ccbin` flag we set manually, so remove this
   # unwanted hook
   print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
     print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
   fi

@@ -192,7 +194,8 @@ install_cuda () {
   # in the future, we will be using conda-forge for installing all CUDA versions
   # (except for versions 11.8 and below, which are only available through
   # nvidia/label/cuda-*)
-  if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
+  if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
+     [[ "$cuda_version" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2086
     (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
       cuda=${cuda_version}) || return 1
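
All three gates in this script (and the one in fbgemm_gpu_build.bash above) use the same anchored bash regex to decide whether a CUDA version gets the newer conda-forge package-layout handling. A minimal Python sketch of the matching behavior, for illustration only (the helper name and version list below are assumptions, not part of the PR):

import re

# Mirrors [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] || [[ ... =~ ^12.8.*$ ]].
# The dots are left unescaped exactly as in the bash scripts; they match
# any character, which is harmless for real version strings.
_NEW_LAYOUT_PATTERNS = [r"^12.6.*$", r"^12.8.*$"]

def uses_new_cuda_layout(cuda_version: str) -> bool:
    # Hypothetical helper, not present in the repo.
    return any(re.match(p, cuda_version) for p in _NEW_LAYOUT_PATTERNS)

for v in ["11.8.0", "12.4.1", "12.6.3", "12.8.0"]:
    print(v, uses_new_cuda_layout(v))  # False, False, True, True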

.github/workflows/fbgemm_gpu_ci_cuda.yml (2 additions, 2 deletions)

@@ -73,7 +73,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]

     steps:

@@ -156,7 +156,7 @@ jobs:
           # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.4.1" ]
         compiler: [ "gcc", "clang" ]

.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml (0 additions, 4 deletions)

@@ -11,14 +11,10 @@ on:
   # PR Trigger
   #
   pull_request:
-    branches:
-      - main

   # Push Trigger (enable to catch errors coming out of multiple merges)
   #
   push:
-    branches:
-      - main

   # Manual Trigger
   #

fbgemm_gpu/cmake/TbeTraining.cmake (17 additions, 1 deletion)

@@ -30,12 +30,14 @@ get_tbe_sources_list(gen_gpu_files_training_pt2)
 get_tbe_sources_list(gen_gpu_files_training_dense)
 get_tbe_sources_list(gen_gpu_files_training_split_host)
 get_tbe_sources_list(gen_gpu_files_training_gwd)
+get_tbe_sources_list(gen_gpu_files_training_vbe)
 handle_genfiles_rocm(gen_cpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training_pt2)
 handle_genfiles_rocm(gen_gpu_files_training_dense)
 handle_genfiles_rocm(gen_gpu_files_training_split_host)
 handle_genfiles_rocm(gen_gpu_files_training_gwd)
+handle_genfiles_rocm(gen_gpu_files_training_vbe)

 # Index Select
 get_tbe_sources_list(static_cpu_files_index_select)

@@ -204,7 +206,6 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

-
 gpu_cpp_library(
   PREFIX
     fbgemm_gpu_tbe_training_backward_gwd

@@ -221,6 +222,21 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

+gpu_cpp_library(
+  PREFIX
+    fbgemm_gpu_tbe_training_backward_vbe
+  TYPE
+    SHARED
+  INCLUDE_DIRS
+    ${fbgemm_sources_include_directories}
+  GPU_SRCS
+    ${gen_gpu_files_training_vbe}
+  NVCC_FLAGS
+    ${TORCH_CUDA_OPTIONS}
+  DEPS
+    fbgemm_gpu_tbe_training_backward
+  DESTINATION
+    fbgemm_gpu)

 gpu_cpp_library(
   PREFIX

fbgemm_gpu/cmake/tbe_sources.py (34 additions, 34 deletions)
@@ -421,6 +421,40 @@
     )
 ]

+gen_gpu_files_training_vbe = [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+] + [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+]
+
 gen_gpu_files_training = (
     [
         "gen_embedding_backward_split_grad_embedding_ops.cu",

@@ -451,40 +485,6 @@
             "gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
         ]
     ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
 )

 gen_hip_files_training = [
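
The relocated comprehension expands optimizer and weighting combinations into generated source filenames. A runnable sketch with stand-in values (VBE_OPTIMIZERS, SSD_OPTIMIZERS, and PARTIAL_WEIGHT_OPTIONS are defined elsewhere in tbe_sources.py; the values below are assumptions for illustration, and only the .cpp half of the list is shown):

VBE_OPTIMIZERS = ["rowwise_adagrad", "sgd"]          # assumed values
SSD_OPTIMIZERS = ["rowwise_adagrad"]                 # assumed values
PARTIAL_WEIGHT_OPTIONS = ["weighted", "unweighted"]  # assumed values

gen_gpu_files_training_vbe = [
    fstring.format(optimizer, wdesc)
    for optimizer in VBE_OPTIMIZERS
    for wdesc in PARTIAL_WEIGHT_OPTIONS
    # Every optimizer gets a split variant; SSD variants are added only
    # for optimizers that support the SSD backend.
    for fstring in ["gen_embedding_backward_{}_split_{}_vbe_meta.cpp"]
    + (
        ["gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp"]
        if optimizer in SSD_OPTIMIZERS
        else []
    )
]

print("\n".join(gen_gpu_files_training_vbe))
# gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_meta.cpp
# gen_embedding_backward_rowwise_adagrad_ssd_weighted_vbe_meta.cpp
# gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_meta.cpp
# gen_embedding_backward_rowwise_adagrad_ssd_unweighted_vbe_meta.cpp
# gen_embedding_backward_sgd_split_weighted_vbe_meta.cpp
# gen_embedding_backward_sgd_split_unweighted_vbe_meta.cpp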

fbgemm_gpu/fbgemm_gpu/__init__.py (1 addition, 0 deletions)

@@ -47,6 +47,7 @@ def _load_library(filename: str) -> None:
         "fbgemm_gpu_tbe_training_backward_dense",
         "fbgemm_gpu_tbe_training_backward_split_host",
         "fbgemm_gpu_tbe_training_backward_gwd",
+        "fbgemm_gpu_tbe_training_backward_vbe",
         "fbgemm_gpu_py",
     ]
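
The new fbgemm_gpu_tbe_training_backward_vbe entry registers the VBE backward library on the same loading path as its siblings. The body of _load_library is not part of this diff; below is a minimal sketch of how such per-module loading typically works, assuming the .so files sit next to __init__.py (an assumption, not the actual implementation):

import os
import torch

def _load_library(filename: str) -> None:
    # Sketch only: register the shared library's operators with PyTorch.
    # The real implementation may resolve paths and errors differently.
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))

_load_library("fbgemm_gpu_tbe_training_backward_vbe.so")  # hypothetical call

Splitting the VBE kernels into their own shared library keeps each .so smaller and link times shorter, while the DEPS clause in TbeTraining.cmake ensures the base fbgemm_gpu_tbe_training_backward library is linked alongside it.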