[fbgemm_gpu] Add benchmark workflows #3713

Closed · wants to merge 1 commit
98 changes: 98 additions & 0 deletions .github/scripts/fbgemm_gpu_benchmarks.bash
@@ -0,0 +1,98 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM_GPU Benchmark Helper Functions
################################################################################

run_tbe_microbench () {
  local env_name="$1"

  __single_run() {
    local cache_type="$1"
    local embedding_location="$2"

    echo "################################################################################"
    echo "# Running Benchmark: (${cache_type}, ${embedding_location})"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""

    # shellcheck disable=SC2155
    local env_prefix=$(env_name_or_prefix "${env_name}")

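    # Map the embedding location onto the benchmark's --managed flag value;
    # locations other than hbm / uvm are not expected here, and any other
    # input would leave ${managed} unset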
if [ "$embedding_location" == "hbm" ]; then
local managed="device"
elif [ "$embedding_location" == "uvm" ]; then
local managed="managed"
fi

# Old TBE benchmark script
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} python tbe/split_table_batched_embeddings_benchmark.py device \
--batch-size 13107 \
--embedding-dim 256 \
--iters 400 \
--warmup-runs 50 \
--alpha 1.15 \
--bag-size 55 \
--weights-precision fp16 \
--cache-precision "${cache_type}" \
--output-dtype bf16 \
--managed="${managed}" \
--num-embeddings 10000000 \
--num-tables 1 \
--row-wise

# New TBE benchmark script
#
# Invoke `python tbe/tbe_training_benchmark.py device --help` for
# documentation on all available flags
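    #
    # The new flags map roughly 1:1 onto the old script's flags, e.g.
    # --batch-size -> --tbe-batch-size, --bag-size -> --tbe-pooling-size,
    # --iters -> --bench-iterations, --warmup-runs -> --bench-warmup-iterations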
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python tbe/tbe_training_benchmark.py device \
        --bench-iterations 400 \
        --bench-warmup-iterations 50 \
        --bench-num-requests 10 \
        --tbe-batch-size 13107 \
        --tbe-embedding-dim 256 \
        --tbe-pooling-size 55 \
        --tbe-num-embeddings 10000000 \
        --tbe-num-tables 1 \
        --weights-precision fp16 \
        --cache-precision "${cache_type}" \
        --output-dtype bf16 \
        --managed="${managed}" \
        --row-wise
  }

  pushd fbgemm_gpu/bench || return 1

  local cache_types=(
    # fp16
    fp32
  )

  local embedding_locations=(
    # uvm
    hbm
  )

  for cache_type in "${cache_types[@]}"; do
    for embedding_location in "${embedding_locations[@]}"; do
      __single_run "${cache_type}" "${embedding_location}" || return 1
      echo ""
      echo ""
    done
  done

  popd || return 1
}
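
For local debugging, the helper can also be exercised outside of CI by sourcing the environment setup script from the repository root (which, per the next hunk, now pulls in this file). A minimal sketch, assuming a conda environment named build_binary with FBGEMM_GPU already installed in it:

    # Hypothetical local invocation from the repository root
    . .github/scripts/setup_env.bash
    run_tbe_microbench build_binary

Note the table size implied by the flags above: 10,000,000 rows x 256 dims x 2 bytes (fp16) is roughly 5.1 GB per table.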
2 changes: 2 additions & 0 deletions .github/scripts/setup_env.bash
@@ -37,3 +37,5 @@
 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash"
 # shellcheck disable=SC1091,SC2128
 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash"
+# shellcheck disable=SC1091,SC2128
+. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_benchmarks.bash"
26 changes: 13 additions & 13 deletions .github/scripts/utils_system.bash
@@ -165,22 +165,22 @@ print_gpu_info () {
   if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
     # Ensure that rocm-smi is available and returns GPU entries
     if ! rocm-smi; then
-      echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
+      echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but do not appear to be installed or available!"
       return 1
     fi
   else
-    if which rocminfo; then
-      # If rocminfo is installed on a machine without GPUs, this will return error
-      (print_exec rocminfo) || true
-    else
-      echo "[CHECK] rocminfo not found"
-    fi
-    if which rocm-smi; then
-      # If rocm-smi is installed on a machine without GPUs, this will return error
-      (print_exec rocm-smi) || true
-    else
-      echo "[CHECK] rocm-smi not found"
-    fi
+    local smi_programs=( rocminfo rocm-smi )
+
+    for smi_program in "${smi_programs[@]}"; do
+      # shellcheck disable=SC2086
+      if which $smi_program; then
+        # If the program is installed on a machine without GPUs, invoking it will return an error
+        # shellcheck disable=SC2086
+        (print_exec $smi_program) || true
+      else
+        echo "[CHECK] $smi_program not found"
+      fi
+    done
   fi
 }
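
The refactor deduplicates the rocminfo / rocm-smi handling without changing behavior. A minimal usage sketch of both modes, assuming the script is sourced from the repository root:

    . .github/scripts/setup_env.bash

    # Informational mode: prints SMI output when available, never fails
    print_gpu_info

    # Enforcing mode: returns 1 unless rocm-smi reports ROCm device(s)
    ENFORCE_ROCM_DEVICE=1 print_gpu_info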

172 changes: 172 additions & 0 deletions .github/workflows/fbgemm_gpu_benchmark_cpu.yml
@@ -0,0 +1,172 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for FBGEMM_GPU-CPU Benchmarking
name: FBGEMM_GPU-CPU Benchmark

on:
  # PR Trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Manual Trigger
  #
  workflow_dispatch:
    inputs:
      pytorch_channel_version:
        description: Package Channel + Version to Use for PyTorch Installation, in `<channel>[/<version>]` Format
        type: string
        required: false
        default: ""

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload the built wheel to GHA
  build_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cpu
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.4xlarge" },
          { arch: arm, instance: "linux.arm64.2xlarge" },
        ]
        python-version: [ "3.13" ]
        compiler: [ "gcc" ]

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install PyTorch-CPU Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cpu

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu

      - name: Upload Built Wheel as GHA Artifact
        uses: actions/upload-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}.whl
          path: fbgemm_gpu/dist/*.whl
          if-no-files-found: error


  # Download the built wheel from GHA and run the benchmark on CPU
  benchmark_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cpu
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.4xlarge", timeout: 20 },
          { arch: arm, instance: "linux.arm64.2xlarge", timeout: 30 },
        ]
        python-version: [ "3.13" ]
        compiler: [ "gcc" ]
    needs: build_artifact

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4

      - name: Download Wheel Artifact from GHA
        uses: actions/download-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}.whl

      - name: Display System Info
        run: . $PRELUDE; print_system_info; print_ec2_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers for Updated LIBGCC
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install PyTorch-CPU Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cpu

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Run FBGEMM_GPU Benchmark
        timeout-minutes: 40
        run: . $PRELUDE; run_tbe_microbench $BUILD_ENV
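
Since the workflow declares a workflow_dispatch trigger with an optional pytorch_channel_version input, a run can also be kicked off manually with the GitHub CLI; a sketch, assuming gh is authenticated and the repository slug is pytorch/FBGEMM:

    # Trigger the CPU benchmark workflow manually, pinning the PyTorch channel
    gh workflow run fbgemm_gpu_benchmark_cpu.yml \
      --repo pytorch/FBGEMM \
      -f pytorch_channel_version="nightly"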