
Commit c08e41d

q10 authored and facebook-github-bot committed
Break down fbgemm_gpu_tbe_training_backward module further, pt 3 (pytorch#779)
Summary:
Pull Request resolved: facebookresearch/FBGEMM#779

Break down the `fbgemm_gpu_tbe_training_backward` module further, to work around instruction relocation issues in CUDA 12.8.

X-link: pytorch#3694
Reviewed By: spcyppt
Differential Revision: D69693785
Pulled By: q10
fbshipit-source-id: 2d98a05a7a8cad30cb87195d9cca1d808395303b
1 parent ba33284 · commit c08e41d

File tree

7 files changed: +65 −47 lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 4 additions & 2 deletions
@@ -118,7 +118,8 @@ __configure_fbgemm_gpu_build_nvcc () {
 }

 __configure_fbgemm_gpu_cuda_home () {
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2155,SC2086
     local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
     local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

@@ -222,7 +223,8 @@ __configure_fbgemm_gpu_build_cuda () {

   if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
      [[ $cuda_version_nvcc == *"V12.4"* ]] ||
-     [[ $cuda_version_nvcc == *"V12.6"* ]]; then
+     [[ $cuda_version_nvcc == *"V12.6"* ]] ||
+     [[ $cuda_version_nvcc == *"V12.8"* ]]; then
     # sm_90 and sm_90a are only available for CUDA 12.1+
     # NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
     # See:
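Note that the bash `[[ =~ ]]` gates above leave the dots unescaped, so `^12.6.*$` actually means "12, any character, 6, anything". A minimal Python sketch of the intended version check, with the dots escaped for strictness (the helper name is hypothetical, not from the repo):

import re

def is_new_cuda_layout(build_cuda_version: str) -> bool:
    # Mirrors the bash gate: true for CUDA 12.6.x and 12.8.x builds.
    # Escaping the dots avoids accidental matches like "12x6".
    return re.match(r"^12\.[68](\.|$)", build_cuda_version) is not None

assert is_new_cuda_layout("12.8.0")
assert not is_new_cuda_layout("12.4.1")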

.github/scripts/utils_cuda.bash

Lines changed: 7 additions & 4 deletions
@@ -18,11 +18,12 @@ __set_cuda_symlinks_envvars () {
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
   local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # CUDA 12.6 installation has a very different package layout than previous
     # CUDA versions - notably, NVTX has been moved elsewhere, which causes
     # PyTorch CMake scripts to complain.
-    echo "[INSTALL] Fixing file placements for CUDA 12.6+ ..."
+    echo "[INSTALL] Fixing file placements for CUDA ${BUILD_CUDA_VERSION}+ ..."

     echo "[INSTALL] Creating symlinks: libnvToolsExt.so"
     print_exec ln -sf "${conda_prefix}/lib/libnvToolsExt.so.1" "${conda_prefix}/lib/libnvToolsExt.so"

@@ -89,7 +90,8 @@ __set_nvcc_prepend_flags () {
   # which overrides whatever `-ccbin` flag we set manually, so remove this
   # unwanted hook
   print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
     print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
   fi

@@ -192,7 +194,8 @@ install_cuda () {
   # in the future, we will be using conda-forge for installing all CUDA versions
   # (except for versions 11.8 and below, which are only available through
   # nvidia/label/cuda-*)
-  if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
+  if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
+     [[ "$cuda_version" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2086
     (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
       cuda=${cuda_version}) || return 1
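The middle hunk strips the conda-injected `-ccbin=` hook by deleting matching lines in place. A rough Python equivalent of that `sed -i '/-ccbin=/d'` step, for readers less familiar with sed (the function name and path are illustrative, not part of the repo):

from pathlib import Path

def drop_ccbin_hook(script: Path) -> None:
    # Keep every line that does not contain '-ccbin=', then rewrite
    # the file in place, mirroring sed -i '/-ccbin=/d'.
    lines = script.read_text().splitlines(keepends=True)
    script.write_text("".join(ln for ln in lines if "-ccbin=" not in ln))

# Usage sketch; the path is assumed from the conda layout referenced above:
# drop_ccbin_hook(Path(conda_prefix) / "etc/conda/activate.d/cuda-nvcc_activate.sh")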

.github/workflows/fbgemm_gpu_ci_cuda.yml

Lines changed: 2 additions & 2 deletions
@@ -73,7 +73,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]

     steps:

@@ -156,7 +156,7 @@ jobs:
           # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.4.1" ]
         compiler: [ "gcc", "clang" ]
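Each matrix is a full cross-product, so one extra CUDA version multiplies the job count. A quick sketch of the arithmetic for the first matrix, counting only the host-config entry visible in the diff context (the workflow may define more outside it):

from itertools import product

hosts = [{"arch": "x86", "instance": "linux.24xlarge"}]  # visible entry only
python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"]
cuda_versions = ["11.8.0", "12.4.1", "12.6.3", "12.8.0"]
compilers = ["gcc", "clang"]

# hosts x python x cuda x compiler, as GitHub Actions expands the matrix
jobs = list(product(hosts, python_versions, cuda_versions, compilers))
print(len(jobs))  # 1 * 5 * 4 * 2 = 40 jobs, up from 30 before 12.8.0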

.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

Lines changed: 0 additions & 4 deletions
@@ -11,14 +11,10 @@ on:
   # PR Trigger
   #
   pull_request:
-    branches:
-      - main

   # Push Trigger (enable to catch errors coming out of multiple merges)
   #
   push:
-    branches:
-      - main

   # Manual Trigger
   #
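Removing the `branches` filters means these triggers now fire for pull requests against any branch and for pushes to any branch, not just `main`.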

fbgemm_gpu/cmake/TbeTraining.cmake

Lines changed: 17 additions & 1 deletion
@@ -30,12 +30,14 @@ get_tbe_sources_list(gen_gpu_files_training_pt2)
 get_tbe_sources_list(gen_gpu_files_training_dense)
 get_tbe_sources_list(gen_gpu_files_training_split_host)
 get_tbe_sources_list(gen_gpu_files_training_gwd)
+get_tbe_sources_list(gen_gpu_files_training_vbe)
 handle_genfiles_rocm(gen_cpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training_pt2)
 handle_genfiles_rocm(gen_gpu_files_training_dense)
 handle_genfiles_rocm(gen_gpu_files_training_split_host)
 handle_genfiles_rocm(gen_gpu_files_training_gwd)
+handle_genfiles_rocm(gen_gpu_files_training_vbe)

 # Index Select
 get_tbe_sources_list(static_cpu_files_index_select)

@@ -204,7 +206,6 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

-
 gpu_cpp_library(
   PREFIX
     fbgemm_gpu_tbe_training_backward_gwd

@@ -221,6 +222,21 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

+gpu_cpp_library(
+  PREFIX
+    fbgemm_gpu_tbe_training_backward_vbe
+  TYPE
+    SHARED
+  INCLUDE_DIRS
+    ${fbgemm_sources_include_directories}
+  GPU_SRCS
+    ${gen_gpu_files_training_vbe}
+  NVCC_FLAGS
+    ${TORCH_CUDA_OPTIONS}
+  DEPS
+    fbgemm_gpu_tbe_training_backward
+  DESTINATION
+    fbgemm_gpu)

 gpu_cpp_library(
   PREFIX
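The new `fbgemm_gpu_tbe_training_backward_vbe` target mirrors the existing `_gwd` split: it compiles only the VBE backward sources into a separate shared library and lists `fbgemm_gpu_tbe_training_backward` in DEPS, in line with the summary's goal of keeping each shared object small enough to sidestep the instruction relocation issues seen under CUDA 12.8.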

fbgemm_gpu/cmake/tbe_sources.py

Lines changed: 34 additions & 34 deletions
@@ -421,6 +421,40 @@
     )
 ]

+gen_gpu_files_training_vbe = [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+] + [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+]
+
 gen_gpu_files_training = (
     [
         "gen_embedding_backward_split_grad_embedding_ops.cu",

@@ -451,40 +485,6 @@
         "gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
     ]
 ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
 )

 gen_hip_files_training = [
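The change moves the VBE comprehensions out of `gen_gpu_files_training` into their own `gen_gpu_files_training_vbe` list; the set of generated files is unchanged, they just feed a separate library target. For a concrete sense of what the comprehension expands to, here is a standalone sketch of the meta-source half with stand-in values (the real VBE_OPTIMIZERS, SSD_OPTIMIZERS, and PARTIAL_WEIGHT_OPTIONS are defined elsewhere in tbe_sources.py):

# Stand-in values, for illustration only.
VBE_OPTIMIZERS = ["rowwise_adagrad"]
SSD_OPTIMIZERS = ["rowwise_adagrad"]
PARTIAL_WEIGHT_OPTIONS = ["weighted", "unweighted"]

meta_sources = [
    f.format(opt, w)
    for opt in VBE_OPTIMIZERS
    for w in PARTIAL_WEIGHT_OPTIONS
    for f in ["gen_embedding_backward_{}_split_{}_vbe_meta.cpp"]
    + (
        ["gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp"]
        if opt in SSD_OPTIMIZERS
        else []
    )
]
print(meta_sources)
# ['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_ssd_weighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_ssd_unweighted_vbe_meta.cpp']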

fbgemm_gpu/fbgemm_gpu/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ def _load_library(filename: str) -> None:
     "fbgemm_gpu_tbe_training_backward_dense",
     "fbgemm_gpu_tbe_training_backward_split_host",
     "fbgemm_gpu_tbe_training_backward_gwd",
+    "fbgemm_gpu_tbe_training_backward_vbe",
     "fbgemm_gpu_py",
 ]
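The new `_vbe` entry is placed before `fbgemm_gpu_py` so that its dependency, `fbgemm_gpu_tbe_training_backward`, is already loaded by the time it is opened. A minimal sketch of such an ordered loader, assuming `_load_library` ultimately calls `torch.ops.load_library` on a path relative to the package (the actual helper in `__init__.py` may differ):

import os
import torch

def _load_library(filename: str) -> None:
    # Assumption: the shared objects ship next to this package's __init__.py.
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))

# Libraries are loaded in dependency order: the base backward library
# first, then the split-out pieces (gwd, vbe, ...) that link against it.
for lib in [
    "fbgemm_gpu_tbe_training_backward.so",
    "fbgemm_gpu_tbe_training_backward_vbe.so",
]:
    _load_library(lib)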
