
Commit c08e41d

q10 authored and facebook-github-bot committed
Break down fbgemm_gpu_tbe_training_backward module further, pt 3 (pytorch#779)
Summary:
Pull Request resolved: facebookresearch/FBGEMM#779

Break down the `fbgemm_gpu_tbe_training_backward` module further, to work around instruction relocation issues in CUDA 12.8.

X-link: pytorch#3694
Reviewed By: spcyppt
Differential Revision: D69693785
Pulled By: q10
fbshipit-source-id: 2d98a05a7a8cad30cb87195d9cca1d808395303b
1 parent ba33284 · commit c08e41d

File tree

7 files changed: +65 −47 lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 4 additions & 2 deletions
@@ -118,7 +118,8 @@ __configure_fbgemm_gpu_build_nvcc () {
 }

 __configure_fbgemm_gpu_cuda_home () {
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2155,SC2086
     local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
     local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

@@ -222,7 +223,8 @@ __configure_fbgemm_gpu_build_cuda () {

   if [[ $cuda_version_nvcc == *"V12.1"* ]] ||
      [[ $cuda_version_nvcc == *"V12.4"* ]] ||
-     [[ $cuda_version_nvcc == *"V12.6"* ]]; then
+     [[ $cuda_version_nvcc == *"V12.6"* ]] ||
+     [[ $cuda_version_nvcc == *"V12.8"* ]]; then
     # sm_90 and sm_90a are only available for CUDA 12.1+
     # NOTE: CUTLASS kernels for Hopper require sm_90a to be enabled
     # See:
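Note that the bash `[[ =~ ]]` gates above leave the dots unescaped, so `^12.6.*$` actually means "12, any character, 6, anything". A minimal Python sketch of the intended version check, with the dots escaped for strictness (the helper name is hypothetical, not from the repo):

import re

def is_new_cuda_layout(build_cuda_version: str) -> bool:
    # Mirrors the bash gate: true for CUDA 12.6.x and 12.8.x builds.
    # Escaping the dots avoids accidental matches like "12x6".
    return re.match(r"^12\.[68](\.|$)", build_cuda_version) is not None

assert is_new_cuda_layout("12.8.0")
assert not is_new_cuda_layout("12.4.1")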

.github/scripts/utils_cuda.bash

Lines changed: 7 additions & 4 deletions
@@ -18,11 +18,12 @@ __set_cuda_symlinks_envvars () {
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
   local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"

-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     # CUDA 12.6 installation has a very different package layout than previous
     # CUDA versions - notably, NVTX has been moved elsewhere, which causes
     # PyTorch CMake scripts to complain.
-    echo "[INSTALL] Fixing file placements for CUDA 12.6+ ..."
+    echo "[INSTALL] Fixing file placements for CUDA ${BUILD_CUDA_VERSION}+ ..."

     echo "[INSTALL] Creating symlinks: libnvToolsExt.so"
     print_exec ln -sf "${conda_prefix}/lib/libnvToolsExt.so.1" "${conda_prefix}/lib/libnvToolsExt.so"

@@ -89,7 +90,8 @@ __set_nvcc_prepend_flags () {
   # which overrides whatever `-ccbin` flag we set manually, so remove this
   # unwanted hook
   print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
+     [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]]; then
     echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
     print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
   fi

@@ -192,7 +194,8 @@ install_cuda () {
   # in the future, we will be using conda-forge for installing all CUDA versions
   # (except for versions 11.8 and below, which are only available through
   # nvidia/label/cuda-*)
-  if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
+  if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
+     [[ "$cuda_version" =~ ^12.8.*$ ]]; then
     # shellcheck disable=SC2086
     (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
       cuda=${cuda_version}) || return 1
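The middle hunk strips the conda-injected `-ccbin=` hook by deleting matching lines in place. A rough Python equivalent of that `sed -i '/-ccbin=/d'` step, for readers less familiar with sed (the function name and path are illustrative, not part of the repo):

from pathlib import Path

def drop_ccbin_hook(script: Path) -> None:
    # Keep every line that does not contain '-ccbin=', then rewrite
    # the file in place, mirroring sed -i '/-ccbin=/d'.
    lines = script.read_text().splitlines(keepends=True)
    script.write_text("".join(ln for ln in lines if "-ccbin=" not in ln))

# Usage sketch; the path is assumed from the conda layout referenced above:
# drop_ccbin_hook(Path(conda_prefix) / "etc/conda/activate.d/cuda-nvcc_activate.sh")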

.github/workflows/fbgemm_gpu_ci_cuda.yml

Lines changed: 2 additions & 2 deletions
@@ -73,7 +73,7 @@ jobs:
           { arch: x86, instance: "linux.24xlarge" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         compiler: [ "gcc", "clang" ]

     steps:

@@ -156,7 +156,7 @@ jobs:
           # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
-        cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
+        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "12.4.1" ]
         compiler: [ "gcc", "clang" ]
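Each matrix is a full cross-product, so one extra CUDA version multiplies the job count. A quick sketch of the arithmetic for the first matrix, counting only the host-config entry visible in the diff context (the workflow may define more outside it):

from itertools import product

hosts = [{"arch": "x86", "instance": "linux.24xlarge"}]  # visible entry only
python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"]
cuda_versions = ["11.8.0", "12.4.1", "12.6.3", "12.8.0"]
compilers = ["gcc", "clang"]

# hosts x python x cuda x compiler, as GitHub Actions expands the matrix
jobs = list(product(hosts, python_versions, cuda_versions, compilers))
print(len(jobs))  # 1 * 5 * 4 * 2 = 40 jobs, up from 30 before 12.8.0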

.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

Lines changed: 0 additions & 4 deletions
@@ -11,14 +11,10 @@ on:
   # PR Trigger
   #
   pull_request:
-    branches:
-      - main

   # Push Trigger (enable to catch errors coming out of multiple merges)
   #
   push:
-    branches:
-      - main

   # Manual Trigger
   #
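Removing the `branches` filters means these triggers now fire for pull requests against any branch and for pushes to any branch, not just `main`.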

fbgemm_gpu/cmake/TbeTraining.cmake

Lines changed: 17 additions & 1 deletion
@@ -30,12 +30,14 @@ get_tbe_sources_list(gen_gpu_files_training_pt2)
 get_tbe_sources_list(gen_gpu_files_training_dense)
 get_tbe_sources_list(gen_gpu_files_training_split_host)
 get_tbe_sources_list(gen_gpu_files_training_gwd)
+get_tbe_sources_list(gen_gpu_files_training_vbe)
 handle_genfiles_rocm(gen_cpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training_pt2)
 handle_genfiles_rocm(gen_gpu_files_training_dense)
 handle_genfiles_rocm(gen_gpu_files_training_split_host)
 handle_genfiles_rocm(gen_gpu_files_training_gwd)
+handle_genfiles_rocm(gen_gpu_files_training_vbe)

 # Index Select
 get_tbe_sources_list(static_cpu_files_index_select)

@@ -204,7 +206,6 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

-
 gpu_cpp_library(
   PREFIX
     fbgemm_gpu_tbe_training_backward_gwd

@@ -221,6 +222,21 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)

+gpu_cpp_library(
+  PREFIX
+    fbgemm_gpu_tbe_training_backward_vbe
+  TYPE
+    SHARED
+  INCLUDE_DIRS
+    ${fbgemm_sources_include_directories}
+  GPU_SRCS
+    ${gen_gpu_files_training_vbe}
+  NVCC_FLAGS
+    ${TORCH_CUDA_OPTIONS}
+  DEPS
+    fbgemm_gpu_tbe_training_backward
+  DESTINATION
+    fbgemm_gpu)

 gpu_cpp_library(
   PREFIX
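The new `fbgemm_gpu_tbe_training_backward_vbe` target mirrors the existing `_gwd` split: it compiles only the VBE backward sources into a separate shared library and lists `fbgemm_gpu_tbe_training_backward` in DEPS, in line with the summary's goal of keeping each shared object small enough to sidestep the instruction relocation issues seen under CUDA 12.8.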

fbgemm_gpu/cmake/tbe_sources.py

Lines changed: 34 additions & 34 deletions
@@ -421,6 +421,40 @@
     )
 ]

+gen_gpu_files_training_vbe = [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+] + [
+    fstring.format(optimizer, wdesc)
+    for optimizer in VBE_OPTIMIZERS
+    for wdesc in PARTIAL_WEIGHT_OPTIONS
+    for fstring in [
+        "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
+        "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
+    ]
+    + (
+        [
+            "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
+            "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
+        ]
+        if optimizer in SSD_OPTIMIZERS
+        else []
+    )
+]
+
 gen_gpu_files_training = (
     [
         "gen_embedding_backward_split_grad_embedding_ops.cu",

@@ -451,40 +485,6 @@
         "gen_embedding_backward_{}_{}_{}_kernel_warp.cu",
     ]
 ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_meta.cpp",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
-    + [
-        fstring.format(optimizer, wdesc)
-        for optimizer in VBE_OPTIMIZERS
-        for wdesc in PARTIAL_WEIGHT_OPTIONS
-        for fstring in [
-            "gen_embedding_backward_{}_split_{}_vbe_cuda.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_cta.cu",
-            "gen_embedding_backward_{}_split_{}_vbe_kernel_warp.cu",
-        ]
-        + (
-            [
-                "gen_embedding_backward_{}_ssd_{}_vbe_cuda.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_cta.cu",
-                "gen_embedding_backward_{}_ssd_{}_vbe_kernel_warp.cu",
-            ]
-            if optimizer in SSD_OPTIMIZERS
-            else []
-        )
-    ]
 )

 gen_hip_files_training = [
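The change moves the VBE comprehensions out of `gen_gpu_files_training` into their own `gen_gpu_files_training_vbe` list; the set of generated files is unchanged, they just feed a separate library target. For a concrete sense of what the comprehension expands to, here is a standalone sketch of the meta-source half with stand-in values (the real VBE_OPTIMIZERS, SSD_OPTIMIZERS, and PARTIAL_WEIGHT_OPTIONS are defined elsewhere in tbe_sources.py):

# Stand-in values, for illustration only.
VBE_OPTIMIZERS = ["rowwise_adagrad"]
SSD_OPTIMIZERS = ["rowwise_adagrad"]
PARTIAL_WEIGHT_OPTIONS = ["weighted", "unweighted"]

meta_sources = [
    f.format(opt, w)
    for opt in VBE_OPTIMIZERS
    for w in PARTIAL_WEIGHT_OPTIONS
    for f in ["gen_embedding_backward_{}_split_{}_vbe_meta.cpp"]
    + (
        ["gen_embedding_backward_{}_ssd_{}_vbe_meta.cpp"]
        if opt in SSD_OPTIMIZERS
        else []
    )
]
print(meta_sources)
# ['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_ssd_weighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_meta.cpp',
#  'gen_embedding_backward_rowwise_adagrad_ssd_unweighted_vbe_meta.cpp']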

fbgemm_gpu/fbgemm_gpu/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ def _load_library(filename: str) -> None:
     "fbgemm_gpu_tbe_training_backward_dense",
     "fbgemm_gpu_tbe_training_backward_split_host",
     "fbgemm_gpu_tbe_training_backward_gwd",
+    "fbgemm_gpu_tbe_training_backward_vbe",
     "fbgemm_gpu_py",
 ]
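The new `_vbe` entry is placed before `fbgemm_gpu_py` so that its dependency, `fbgemm_gpu_tbe_training_backward`, is already loaded by the time it is opened. A minimal sketch of such an ordered loader, assuming `_load_library` ultimately calls `torch.ops.load_library` on a path relative to the package (the actual helper in `__init__.py` may differ):

import os
import torch

def _load_library(filename: str) -> None:
    # Assumption: the shared objects ship next to this package's __init__.py.
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))

# Libraries are loaded in dependency order: the base backward library
# first, then the split-out pieces (gwd, vbe, ...) that link against it.
for lib in [
    "fbgemm_gpu_tbe_training_backward.so",
    "fbgemm_gpu_tbe_training_backward_vbe.so",
]:
    _load_library(lib)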
