
Commit 25d2486

sryap authored and facebook-github-bot committed
Make indices related to cache eviction UVA tensors (pytorch#3077)
Summary:
Pull Request resolved: pytorch#3077
X-link: facebookresearch/FBGEMM#171

This is a follow-up diff of D62114877, which makes the indices related to L1 cache eviction UVA buffers.

Differential Revision: D62114882
1 parent e178ed9 commit 25d2486
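The diff below makes the eviction-related index buffers persistent UVA (host-mapped unified) tensors instead of per-iteration device tensors that must be copied to pinned CPU memory. As a rough sketch of the allocation pattern introduced in training.py (assuming fbgemm_gpu is installed; make_uva_eviction_buffers, cache_rows, device, and host_mapped are hypothetical stand-ins for the module's own attributes):

import torch
import fbgemm_gpu  # noqa: F401  # registers the torch.ops.fbgemm operators

def make_uva_eviction_buffers(cache_rows, device, host_mapped):
    # Embedding indices of the rows staged for eviction; the first argument is
    # a prototype tensor supplying dtype/device, the second is the real shape.
    evicted_indices = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=torch.long),
        (cache_rows,),
        is_host_mapped=host_mapped,
    )
    # Cache slots whose rows are being evicted
    evicted_slots = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=torch.int),
        (cache_rows,),
        is_host_mapped=host_mapped,
    )
    # Single-element counter holding the number of rows to evict
    evicted_count = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=torch.int),
        (1,),
        is_host_mapped=host_mapped,
    )
    return evicted_indices, evicted_slots, evicted_count

Because the buffers are host-mapped, the CPU-side eviction path can read them directly, which is why the separate "## ssd_d2h_evicted_indices ##" device-to-host copy is removed further down.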

File tree

2 files changed: +76 −33 lines


fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 54 additions & 27 deletions
@@ -309,6 +309,7 @@ def __init__(
             * self.lxu_cache_weights.element_size()
         ), "The precomputed cache_size does not match the actual cache size"
 
+        # Buffers for cache eviction
         # For storing weights to evict
         # The max number of rows to be evicted is limited by the number of
         # slots in the cache. Thus, we allocate `lxu_cache_evicted_weights` to
@@ -325,6 +326,49 @@ def __init__(
                 is_host_mapped=self.uvm_host_mapped,
             ),
         )
+
+        # For storing embedding indices to evict to
+        self.register_buffer(
+            "lxu_cache_evicted_indices",
+            torch.ops.fbgemm.new_unified_tensor(
+                torch.zeros(
+                    1,
+                    device=self.current_device,
+                    dtype=torch.long,
+                ),
+                (self.lxu_cache_weights.shape[0],),
+                is_host_mapped=self.uvm_host_mapped,
+            ),
+        )
+
+        # For storing cache slots to evict
+        self.register_buffer(
+            "lxu_cache_evicted_slots",
+            torch.ops.fbgemm.new_unified_tensor(
+                torch.zeros(
+                    1,
+                    device=self.current_device,
+                    dtype=torch.int,
+                ),
+                (self.lxu_cache_weights.shape[0],),
+                is_host_mapped=self.uvm_host_mapped,
+            ),
+        )
+
+        # For storing the number of evicted rows
+        self.register_buffer(
+            "lxu_cache_evicted_count",
+            torch.ops.fbgemm.new_unified_tensor(
+                torch.zeros(
+                    1,
+                    device=self.current_device,
+                    dtype=torch.int,
+                ),
+                (1,),
+                is_host_mapped=self.uvm_host_mapped,
+            ),
+        )
+
         self.timestep = 0
 
         # Dummy profile configuration for measuring the SSD get/set time
@@ -1083,35 +1127,30 @@ def prefetch( # noqa C901
             self.local_ssd_cache_stats,
         )
 
-        # Allocate output tensors for compact_indices
-        compact_evicted_indices = torch.empty_like(evicted_indices)
-        compact_assigned_cache_slots = torch.empty_like(assigned_cache_slots)
-        compact_actions_count_gpu = torch.empty_like(actions_count_gpu)
-
         # Defrag indices based on evicted_indices (removing -1 and making
         # the non -1 elements contiguous). We need to do this because the
         # number of rows in `lxu_cache_evicted_weights` might be smaller
         # than the number of elements in `evicted_indices`. Without this
         # step, we can run into the index out of bound issue
         current_stream.wait_event(self.ssd_event_cache_evict)
         torch.ops.fbgemm.compact_indices(
-            compact_indices=[compact_evicted_indices, compact_assigned_cache_slots],
-            compact_count=compact_actions_count_gpu,
+            compact_indices=[
+                self.lxu_cache_evicted_indices,
+                self.lxu_cache_evicted_slots,
+            ],
+            compact_count=self.lxu_cache_evicted_count,
             indices=[evicted_indices, assigned_cache_slots],
             masks=torch.where(evicted_indices != -1, 1, 0),
             count=actions_count_gpu,
         )
 
-        evicted_indices = compact_evicted_indices
-
         with record_function("## ssd_d2h_inserted_indices ##"):
             # Transfer actions_count and insert_indices right away to
             # incrase an overlap opportunity
-            actions_count_cpu, compact_actions_count_cpu, inserted_indices_cpu = (
+            actions_count_cpu, inserted_indices_cpu = (
                 self.to_pinned_cpu_on_stream_wait_on_another_stream(
                     tensors=[
                         actions_count_gpu,
-                        compact_actions_count_gpu,
                         inserted_indices,
                     ],
                     stream=self.ssd_memcpy_stream,
@@ -1120,26 +1159,14 @@ def prefetch( # noqa C901
                 )
             )
 
-        with record_function("## ssd_d2h_evicted_indices ##"):
-            # Transfer evicted indices from GPU to CPU right away to increase a
-            # chance of overlapping with compute on the default stream
-            (evicted_indices_cpu,) = (
-                self.to_pinned_cpu_on_stream_wait_on_another_stream(
-                    tensors=[evicted_indices],
-                    stream=self.ssd_eviction_stream,
-                    stream_to_wait_on=current_stream,
-                    post_event=None,
-                )
-            )
-
         # Copy rows to be evicted into a separate buffer (will be evicted
         # later in the prefetch step)
         with record_function("## ssd_compute_evicted_rows ##"):
             torch.ops.fbgemm.masked_index_select(
                 self.lxu_cache_evicted_weights,
-                compact_assigned_cache_slots,
+                self.lxu_cache_evicted_slots,
                 self.lxu_cache_weights,
-                compact_actions_count_gpu,
+                self.lxu_cache_evicted_count,
             )
 
         # Allocation a scratch pad for the current iteration. The scratch
@@ -1293,8 +1320,8 @@ def prefetch( # noqa C901
             # Evict rows from cache to SSD
             self.evict(
                 rows=self.lxu_cache_evicted_weights,
-                indices_cpu=evicted_indices_cpu,
-                actions_count_cpu=compact_actions_count_cpu,
+                indices_cpu=self.lxu_cache_evicted_indices,
+                actions_count_cpu=self.lxu_cache_evicted_count,
                 stream=self.ssd_eviction_stream,
                 pre_event=self.ssd_event_get,
                 # Record completion event after scratch pad eviction
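As the comments in the prefetch hunk note, the compaction step exists to drop the -1 sentinels from evicted_indices so the surviving entries fit into lxu_cache_evicted_weights. Below is a minimal, hypothetical torch illustration of what torch.ops.fbgemm.compact_indices computes; the actual fused kernel writes its results into the UVA buffers above.

import torch

# Toy inputs: -1 marks cache lines with nothing to evict
evicted_indices = torch.tensor([42, -1, 7, -1, 19], dtype=torch.long)
assigned_cache_slots = torch.tensor([3, 0, 5, 1, 2], dtype=torch.int32)

mask = evicted_indices != -1
compact_evicted_indices = evicted_indices[mask]    # tensor([42,  7, 19])
compact_cache_slots = assigned_cache_slots[mask]   # tensor([3, 5, 2], dtype=torch.int32)
compact_count = int(mask.sum())                    # 3 rows to actually evict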

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.cpp

Lines changed: 22 additions & 6 deletions
@@ -16,16 +16,31 @@
 
 namespace kv_db {
 
+namespace {
+
+/// Read a scalar value from a tensor that is maybe a UVM tensor
+/// Note that `tensor.item<type>()` is not allowed on a UVM tensor in
+/// PyTorch
+inline int64_t get_maybe_uvm_scalar(const at::Tensor& tensor) {
+  return tensor.scalar_type() == at::ScalarType::Long
+      ? *(tensor.data_ptr<int64_t>())
+      : *(tensor.data_ptr<int32_t>());
+}
+
+}; // namespace
+
 std::tuple<at::Tensor, at::Tensor, at::Tensor> tensor_copy(
     const at::Tensor& indices,
     const at::Tensor& weights,
     const at::Tensor& count) {
-  auto num_sets = count.item<long>();
+  auto num_sets = get_maybe_uvm_scalar(count);
   auto new_indices = at::empty(
       num_sets, at::TensorOptions().device(at::kCPU).dtype(indices.dtype()));
   auto new_weights = at::empty(
       {num_sets, weights.size(1)},
       at::TensorOptions().device(at::kCPU).dtype(weights.dtype()));
+  auto new_count =
+      at::empty({1}, at::TensorOptions().device(at::kCPU).dtype(at::kLong));
   FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
       weights.scalar_type(), "cache_memcpy", [&] {
         auto indices_addr = indices.data_ptr<int64_t>();
@@ -42,7 +57,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> tensor_copy(
             weights_addr + num_sets * weights.size(1),
             new_weightss_addr); // dst_start
       });
-  return std::make_tuple(new_indices, new_weights, count.clone());
+  *new_count.data_ptr<int64_t>() = num_sets;
+  return std::make_tuple(new_indices, new_weights, new_count);
 }
 
 EmbeddingKVDB::EmbeddingKVDB(
@@ -182,7 +198,7 @@ void EmbeddingKVDB::set(
     const at::Tensor& weights,
     const at::Tensor& count,
     const bool is_bwd) {
-  if (auto num_evictions = count.item<long>(); num_evictions <= 0) {
+  if (auto num_evictions = get_maybe_uvm_scalar(count); num_evictions <= 0) {
     XLOG_EVERY_MS(INFO, 60000)
         << "[" << unique_id_ << "]skip set_cuda since number evictions is "
         << num_evictions;
@@ -204,7 +220,7 @@ void EmbeddingKVDB::get(
     const at::Tensor& indices,
     const at::Tensor& weights,
     const at::Tensor& count) {
-  if (auto num_lookups = count.item<long>(); num_lookups <= 0) {
+  if (auto num_lookups = get_maybe_uvm_scalar(count); num_lookups <= 0) {
     XLOG_EVERY_MS(INFO, 60000)
         << "[" << unique_id_ << "]skip get_cuda since number lookups is "
         << num_lookups;
@@ -255,7 +271,7 @@ std::shared_ptr<CacheContext> EmbeddingKVDB::get_cache(
   }
   auto start_ts = facebook::WallClockUtil::NowInUsecFast();
   auto indices_addr = indices.data_ptr<int64_t>();
-  auto num_lookups = count.item<long>();
+  auto num_lookups = get_maybe_uvm_scalar(count);
   auto cache_context = std::make_shared<CacheContext>(num_lookups);
 
   auto num_shards = executor_tp_->numThreads();
@@ -348,7 +364,7 @@ folly::Optional<std::pair<at::Tensor, at::Tensor>> EmbeddingKVDB::set_cache(
 
   l2_cache_->init_tensor_for_l2_eviction(indices, weights, count);
   auto indices_addr = indices.data_ptr<int64_t>();
-  auto num_lookups = count.item<long>();
+  const int64_t num_lookups = get_maybe_uvm_scalar(count);
   auto num_shards = executor_tp_->numThreads();
 
   std::vector<folly::coro::TaskWithExecutor<void>> tasks;
