Limit the number of ROCm hardware targets (pytorch#886)

q10 · facebook-github-bot · commit e754b2dffb1b · 2025-03-11T20:16:41.000-07:00
Summary: Pull Request resolved: facebookresearch/FBGEMM#886 - Limit the number of ROCm hardware targets to reduce Nova ROCm build times X-link: pytorch#3797 Reviewed By: sryap Differential Revision: D70949678 Pulled By: q10 fbshipit-source-id: a14cc0f12c7988aa3e9b68549bad9d109f1d7ca6
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -171,12 +171,22 @@ __configure_fbgemm_gpu_build_rocm () {
         # cards, in which case the arch_list will be empty.
         echo "[BUILD] rocminfo did not return anything valid!"
 
-        # By default, we build just for MI100 and MI250 to save time.  This list
-        # needs to be updated if the CI ROCm machines have different hardware.
+        # By default, we build for a limited number of architectures to save on
+        # build time.  This list needs to be updated if the CI ROCm machines
+        # have different hardware.
         #
         # Architecture mapping can be found at:
         #   https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
-        local arch_list="gfx908,gfx90a,gfx942"
+        if [ -z "${BUILD_FROM_NOVA+x}" ]; then
+          # If BUILD_FROM_NOVA is unset, then we are building on an AMD host with
+          # sufficient resources, so we can build for more architectures.
+          local arch_list="gfx908,gfx90a,gfx942"
+        else
+          # If BUILD_FROM_NOVA is set (to any value, even empty), we are building
+          # in Nova.  Nova machines take longer to build FBGEMM_GPU for ROCm, so
+          # we limit the build to a single architecture.
+          local arch_list="gfx942"
+        fi
       fi
     else
       echo "[BUILD] rocminfo not found in PATH!"
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
@@ -13,6 +13,12 @@ if [[ "$working_dir" == "$FBGEMM_REPO" ]]; then cd fbgemm_gpu || echo "Failed to
 ## Build clean/wheel will be done in pre-script. Set flag such that setup.py will skip these steps in Nova workflow
 export BUILD_FROM_NOVA=1
 
+if [[ "$CU_VERSION" == "cu"* ]]; then
+    echo "Current TORCH_CUDA_ARCH_LIST value: ${TORCH_CUDA_ARCH_LIST}"
+elif [[ "$CU_VERSION" == "rocm"* ]]; then
+    echo "Current PYTORCH_ROCM_ARCH value: ${PYTORCH_ROCM_ARCH}"
+fi
+
 ## Overwrite existing ENV VAR in Nova
 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
 
@@ -34,4 +40,20 @@ elif [[ "$CU_VERSION" == "cu"* ]]; then
     echo ""
     echo "Will default to the TORCH_CUDA_ARCH_LIST supplied by the environment!!!"
     echo "################################################################################"
+
+elif [[ "$CU_VERSION" == "rocm6.3"* ]]; then
+    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
+    echo "Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
+
+elif [[ "$CU_VERSION" == "rocm6.2"* ]]; then
+    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942"
+    echo "Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
+
+elif [[ "$CU_VERSION" == "rocm"* ]]; then
+    echo "################################################################################"
+    echo "[NOVA] Currently building the ROCm variant, but the supplied CU_VERSION is"
+    echo "unknown or not supported in FBGEMM_GPU: ${CU_VERSION}"
+    echo ""
+    echo "Will default to the PYTORCH_ROCM_ARCH supplied by the environment!!!"
+    echo "################################################################################"
 fi
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -59,6 +59,20 @@ if(SKBUILD)
   BLOCK_PRINT("The project is built using scikit-build")
 endif()
 
+BLOCK_PRINT(
+  "Build Settings"
+  ""
+  "NVCC_VERBOSE           : ${NVCC_VERBOSE}"
+  "CUDNN_INCLUDE_DIR      : ${CUDNN_INCLUDE_DIR}"
+  "CUDNN_LIBRARY          : ${CUDNN_LIBRARY}"
+  "NVML_LIB_PATH          : ${NVML_LIB_PATH}"
+  "TORCH_CUDA_ARCH_LIST   : ${TORCH_CUDA_ARCH_LIST}"
+  ""
+  "HIP_ROOT_DIR           : ${HIP_ROOT_DIR}"
+  "HIPCC_VERBOSE          : ${HIPCC_VERBOSE}"
+  "AMDGPU_TARGETS         : ${AMDGPU_TARGETS}"
+  "PYTORCH_ROCM_ARCH      : ${PYTORCH_ROCM_ARCH}")
+
 if(FBGEMM_CPU_ONLY OR USE_ROCM)
   project(
     fbgemm_gpu