GPU timing and basic reporting framework (rebase of D52716004) (pytorch#2314)

levythu · facebook-github-bot · commit 65cd9f02e05b · 2024-02-26T15:15:13.000-08:00
Summary:
Implements a framework for reporting the internal state of each TBE (table batched embedding) op, for better visibility.


Differential Revision: D53028585
diff --git a/fbgemm_gpu/fbgemm_gpu/embedding_offloading_metrics.py b/fbgemm_gpu/fbgemm_gpu/embedding_offloading_metrics.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import abc
+
+
+class IEmbeddingOffloadingMetricsReporter(abc.ABC):
+    """
+    Reporter interface. All report_XXX methods should be lightweight and fail-safe.
+    """
+
+    @abc.abstractmethod
+    def should_report(self, iteration_step: int) -> bool:
+        """
+        Return whether metrics should be reported during the given iteration step.
+        This function should be cheap, side-effect free, and return immediately.
+        """
+        ...
+
+    @abc.abstractmethod
+    def report_duration(
+        self,
+        iteration_step: int,
+        event_name: str,
+        duration_ms: float,
+        embedding_id: str = "",
+        tbe_id: str = "",
+    ) -> None:
+        """
+        Report the duration, in milliseconds, of the timed event `event_name`.
+        """
+        ...
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -20,6 +20,7 @@
 from torch import nn, Tensor  # usort:skip
 
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.embedding_offloading_metrics import IEmbeddingOffloadingMetricsReporter
 from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType, SparseType
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     BoundsCheckMode,
@@ -348,6 +349,7 @@ def __init__(  # noqa C901
         # If a separate stream is used for prefetch, the optional forward_stream arg of prefetch function
         # should be set.
         prefetch_pipeline: bool = False,
+        metrics_reporter: Optional[IEmbeddingOffloadingMetricsReporter] = None,
     ) -> None:
         super(SplitTableBatchedEmbeddingBagsCodegen, self).__init__()
 
@@ -441,6 +443,8 @@ def __init__(  # noqa C901
         # 0: N_calls, 1: N_requested_indices, 2: N_unique_indices, 3: N_unique_misses,
         # 4: N_conflict_unique_misses, 5: N_conflict_misses
 
+        self.metrics_reporter = metrics_reporter
+
         self.int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET
 
         self.feature_table_map: List[int] = (
diff --git a/fbgemm_gpu/test/tbe/cache/cache_common.py b/fbgemm_gpu/test/tbe/cache/cache_common.py
@@ -7,10 +7,11 @@
 
 # pyre-ignore-all-errors[56]
 
-from typing import Tuple
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
+from fbgemm_gpu.embedding_offloading_metrics import IEmbeddingOffloadingMetricsReporter
 from fbgemm_gpu.split_embedding_configs import SparseType
 
 from fbgemm_gpu.split_embedding_utils import round_up
@@ -37,6 +38,27 @@
 VERBOSITY: Verbosity = Verbosity.verbose
 
 
+class TestingEmbeddingOffloadingMetricsReporter(IEmbeddingOffloadingMetricsReporter):
+    def __init__(self, reporting_interval: int = 1) -> None:
+        self.reported_data: List[List[Union[int, str, float]]] = []  # one row per report
+        self.reporting_interval: int = reporting_interval  # N in should_report
+
+    def should_report(self, iteration_step: int) -> bool:  # True at steps 1, 1+N, 1+2N, ...
+        return (iteration_step - 1) % self.reporting_interval == 0
+
+    def report_duration(
+        self,
+        iteration_step: int,
+        event_name: str,
+        duration_ms: float,
+        embedding_id: str = "",
+        tbe_id: str = "",
+    ) -> None:
+        self.reported_data.append(  # kept in memory only; nothing is emitted elsewhere
+            [iteration_step, event_name, duration_ms, embedding_id, tbe_id]
+        )
+
+
+
 def generate_cache_tbes(
     T: int,
     D: int,
@@ -48,6 +70,7 @@ def generate_cache_tbes(
     cache_sets: int = 0,
     weights_cache_precision: SparseType = SparseType.FP32,
     stochastic_rounding: bool = False,
+    reporter: Optional[TestingEmbeddingOffloadingMetricsReporter] = None,
 ) -> Tuple[
     SplitTableBatchedEmbeddingBagsCodegen,
     SplitTableBatchedEmbeddingBagsCodegen,
@@ -103,6 +126,7 @@ def generate_cache_tbes(
         cache_sets=cache_sets,
         weights_precision=weights_cache_precision,
         cache_precision=weights_cache_precision,
+        metrics_reporter=reporter,
     )
 
     if use_int_weight:
diff --git a/fbgemm_gpu/test/tbe/cache/cache_test.py b/fbgemm_gpu/test/tbe/cache/cache_test.py
@@ -38,6 +38,7 @@
     generate_cache_tbes,
     gpu_unavailable,
     optests,
+    TestingEmbeddingOffloadingMetricsReporter,
     VERBOSITY,
 )
 
@@ -122,6 +123,7 @@ def _test_cache_prefetch_pipeline(  # noqa C901
         """
 
         assert prefetch_location in ["before_fwd", "between_fwd_bwd"]
+        reporter = TestingEmbeddingOffloadingMetricsReporter(reporting_interval=2)
         cc, cc_ref, min_Es, sum_Ds = generate_cache_tbes(
             T,
             D,
@@ -132,6 +134,7 @@ def _test_cache_prefetch_pipeline(  # noqa C901
             use_int_weight=True,
             weights_cache_precision=weights_cache_precision,
             stochastic_rounding=stochastic_rounding,
+            reporter=reporter,
         )
         iters = 5
         requests = generate_requests(iters, B, T, L, min_Es, reuse=0.1)