[fbgemm_gpu] Add benchmark workflows #3713

Closed · wants to merge 1 commit
98 changes: 98 additions & 0 deletions .github/scripts/fbgemm_gpu_benchmarks.bash
@@ -0,0 +1,98 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM_GPU Benchmark Helper Functions
################################################################################

run_tbe_microbench () {
  local env_name="$1"

  __single_run() {
    local cache_type="$1"
    local embedding_location="$2"

    echo "################################################################################"
    echo "# Running Benchmark: (${cache_type}, ${embedding_location})"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""

    # shellcheck disable=SC2155
    local env_prefix=$(env_name_or_prefix "${env_name}")

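    # Map the embedding location onto the benchmark's --managed flag value;
    # locations other than hbm / uvm are not expected here, and any other
    # input would leave ${managed} unset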
if [ "$embedding_location" == "hbm" ]; then
local managed="device"
elif [ "$embedding_location" == "uvm" ]; then
local managed="managed"
fi

# Old TBE benchmark script
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} python tbe/split_table_batched_embeddings_benchmark.py device \
--batch-size 13107 \
--embedding-dim 256 \
--iters 400 \
--warmup-runs 50 \
--alpha 1.15 \
--bag-size 55 \
--weights-precision fp16 \
--cache-precision "${cache_type}" \
--output-dtype bf16 \
--managed="${managed}" \
--num-embeddings 10000000 \
--num-tables 1 \
--row-wise

# New TBE benchmark script
#
# Invoke `python tbe/tbe_training_benchmark.py device --help` for
# documentation on all available flags
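    #
    # The new flags map roughly 1:1 onto the old script's flags, e.g.
    # --batch-size -> --tbe-batch-size, --bag-size -> --tbe-pooling-size,
    # --iters -> --bench-iterations, --warmup-runs -> --bench-warmup-iterations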
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python tbe/tbe_training_benchmark.py device \
        --bench-iterations 400 \
        --bench-warmup-iterations 50 \
        --bench-num-requests 10 \
        --tbe-batch-size 13107 \
        --tbe-embedding-dim 256 \
        --tbe-pooling-size 55 \
        --tbe-num-embeddings 10000000 \
        --tbe-num-tables 1 \
        --weights-precision fp16 \
        --cache-precision "${cache_type}" \
        --output-dtype bf16 \
        --managed="${managed}" \
        --row-wise
  }

  pushd fbgemm_gpu/bench || return 1

  local cache_types=(
    # fp16
    fp32
  )

  local embedding_locations=(
    # uvm
    hbm
  )

  for cache_type in "${cache_types[@]}"; do
    for embedding_location in "${embedding_locations[@]}"; do
      __single_run "${cache_type}" "${embedding_location}" || return 1
      echo ""
      echo ""
    done
  done

  popd || return 1
}
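
For local debugging, the helper can also be exercised outside of CI by sourcing the environment setup script from the repository root (which, per the next hunk, now pulls in this file). A minimal sketch, assuming a conda environment named build_binary with FBGEMM_GPU already installed in it:

    # Hypothetical local invocation from the repository root
    . .github/scripts/setup_env.bash
    run_tbe_microbench build_binary

Note the table size implied by the flags above: 10,000,000 rows x 256 dims x 2 bytes (fp16) is roughly 5.1 GB per table.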
2 changes: 2 additions & 0 deletions .github/scripts/setup_env.bash
@@ -37,3 +37,5 @@
 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash"
 # shellcheck disable=SC1091,SC2128
 . "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash"
+# shellcheck disable=SC1091,SC2128
+. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_benchmarks.bash"
26 changes: 13 additions & 13 deletions .github/scripts/utils_system.bash
@@ -165,22 +165,22 @@ print_gpu_info () {
   if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
     # Ensure that rocm-smi is available and returns GPU entries
     if ! rocm-smi; then
-      echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
+      echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but do not appear to be installed or available!"
       return 1
     fi
   else
-    if which rocminfo; then
-      # If rocminfo is installed on a machine without GPUs, this will return error
-      (print_exec rocminfo) || true
-    else
-      echo "[CHECK] rocminfo not found"
-    fi
-    if which rocm-smi; then
-      # If rocm-smi is installed on a machine without GPUs, this will return error
-      (print_exec rocm-smi) || true
-    else
-      echo "[CHECK] rocm-smi not found"
-    fi
+    local smi_programs=( rocminfo rocm-smi )
+
+    for smi_program in "${smi_programs[@]}"; do
+      # shellcheck disable=SC2086
+      if which $smi_program; then
+        # If the program is installed on a machine without GPUs, invoking it will return an error
+        # shellcheck disable=SC2086
+        (print_exec $smi_program) || true
+      else
+        echo "[CHECK] $smi_program not found"
+      fi
+    done
   fi
 }
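
The refactor deduplicates the rocminfo / rocm-smi handling without changing behavior. A minimal usage sketch of both modes, assuming the script is sourced from the repository root:

    . .github/scripts/setup_env.bash

    # Informational mode: prints SMI output when available, never fails
    print_gpu_info

    # Enforcing mode: returns 1 unless rocm-smi reports ROCm device(s)
    ENFORCE_ROCM_DEVICE=1 print_gpu_info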

172 changes: 172 additions & 0 deletions .github/workflows/fbgemm_gpu_benchmark_cpu.yml
@@ -0,0 +1,172 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for FBGEMM_GPU-CPU Benchmarking
name: FBGEMM_GPU-CPU Benchmark

on:
  # PR Trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Manual Trigger
  #
  workflow_dispatch:
    inputs:
      pytorch_channel_version:
        description: Package Channel + Version to Use for PyTorch Installation, in `<channel>[/<version>]` Format
        type: string
        required: false
        default: ""

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload the built wheel to GHA
  build_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cpu
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.4xlarge" },
          { arch: arm, instance: "linux.arm64.2xlarge" },
        ]
        python-version: [ "3.13" ]
        compiler: [ "gcc" ]

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install PyTorch-CPU Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cpu

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu

      - name: Upload Built Wheel as GHA Artifact
        uses: actions/upload-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}.whl
          path: fbgemm_gpu/dist/*.whl
          if-no-files-found: error


  # Download the built wheel from GHA and run the benchmark on CPU
  benchmark_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cpu
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.4xlarge", timeout: 20 },
          { arch: arm, instance: "linux.arm64.2xlarge", timeout: 30 },
        ]
        python-version: [ "3.13" ]
        compiler: [ "gcc" ]
    needs: build_artifact

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4

      - name: Download Wheel Artifact from GHA
        uses: actions/download-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}.whl

      - name: Display System Info
        run: . $PRELUDE; print_system_info; print_ec2_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers for Updated LIBGCC
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install PyTorch-CPU Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cpu

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Run FBGEMM_GPU Benchmark
        timeout-minutes: 40
        run: . $PRELUDE; run_tbe_microbench $BUILD_ENV
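
Since the workflow declares a workflow_dispatch trigger with an optional pytorch_channel_version input, a run can also be kicked off manually with the GitHub CLI; a sketch, assuming gh is authenticated and the repository slug is pytorch/FBGEMM:

    # Trigger the CPU benchmark workflow manually, pinning the PyTorch channel
    gh workflow run fbgemm_gpu_benchmark_cpu.yml \
      --repo pytorch/FBGEMM \
      -f pytorch_channel_version="nightly"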